From 24c2ae1128c4f494434a1190397e8d5f84d9f5ce Mon Sep 17 00:00:00 2001 From: Kundrata Date: Mon, 12 Mar 2018 20:48:35 +0200 Subject: [PATCH 01/77] Correct api.json information An attempt to fix issue 1084: https://github.com/fireice-uk/xmr-stak/issues/1084 --- xmrstak/misc/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index c4ba26ead..473612cec 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -1223,7 +1223,7 @@ void executor::http_json_report(std::string& out) if(i != 0) cn_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiConnectionError, - int_port(duration_cast(vMineResults[i].time.time_since_epoch()).count()), + int_port(duration_cast(vSocketLog[i].time.time_since_epoch()).count()), vSocketLog[i].msg.c_str()); cn_error.append(buffer); } From 038c88efbcc75d29f62d6f32bc2d36269a867a4c Mon Sep 17 00:00:00 2001 From: maurezen Date: Tue, 5 Jun 2018 08:40:34 +0300 Subject: [PATCH 02/77] Update compile_Linux.md RAM requirement mention in compile_linux --- doc/compile_Linux.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index 072402ff7..2e45b21dd 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -105,6 +105,8 @@ In that case you can force CUDA to use an older compiler in the following way: cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 .. ``` +- You need 1 Gb RAM to compile (a bit less might be enough, 512 Mb isn't). + ### To do a generic and static build for a system without gcc 5.1+ ``` cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic . 
From 145ac6ffb53d5b4ec11b464523c363f8302a8daf Mon Sep 17 00:00:00 2001 From: maurezen Date: Tue, 5 Jun 2018 09:16:06 +0300 Subject: [PATCH 03/77] Update FAQ.md Internal compiler error on low-RAM machines --- doc/FAQ.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/FAQ.md b/doc/FAQ.md index aa6fb8959..a8adf064f 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -11,6 +11,7 @@ * [How can I mine Monero](#how-can-i-mine-monero) * [Why is Monero named monero7](#why-is-monero-named-monero7) * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed) +* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error) ## "Obtaining SeLockMemoryPrivilege failed." @@ -87,3 +88,7 @@ To avoid configuration conflicts after the hard fork of Monero to the new POW wi If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm. Select one of these generic coin algorithms. + +## Internal compiler error + +Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04). 
From bd81795eeedaafe2fa7752b9919cd8149dd2d408 Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Sun, 6 May 2018 21:11:30 +0200 Subject: [PATCH 04/77] adding qrl as a config option --- README.md | 1 + xmrstak/jconf.cpp | 1 + xmrstak/pools.tpl | 1 + 3 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 887bc5cf3..ff4b8d3ad 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Haven](https://havenprotocol.com) - [Intense](https://intensecoin.com) - [Masari](https://getmasari.org) +- [QRL](https://theqrl.org) - [Ryo](https://ryo-currency.com) - [TurtleCoin](https://turtlecoin.lol) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 354388849..b6580ea9a 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -105,6 +105,7 @@ xmrstak::coin_selection coins[] = { { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, + { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "turtlecoin", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr } diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 6960d63bb..78f2315ac 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -28,6 +28,7 @@ POOLCONF], * intense * masari * monero7 (use this for Monero's new PoW) + * qrl - Quantum Resistant Ledger * ryo * turtlecoin * From 
0e1193d93b09c55b7215ec35dfb4b226cd7bf4bd Mon Sep 17 00:00:00 2001 From: Cheran Date: Wed, 25 Jul 2018 12:04:10 +0530 Subject: [PATCH 05/77] Fix Spelling changed chose to choose --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 887bc5cf3..ec2a22b20 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Ryo](https://ryo-currency.com) - [TurtleCoin](https://turtlecoin.lol) -If your prefered coin is not listed, you can chose one of the following algorithms: +If your prefered coin is not listed, you can choose one of the following algorithms: - 1MiB scratchpad memory - cryptonight_lite From cbe03f7e6f2806f15ac726d261ccf38b8304a787 Mon Sep 17 00:00:00 2001 From: Cheran Date: Sun, 29 Jul 2018 17:14:48 +0530 Subject: [PATCH 06/77] Fix Grammar --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 327c4e9f8..e3b01328a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ###### fireice-uk's and psychocrypt's # XMR-Stak - Cryptonight All-in-One Mining Software -XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currencys Monero, Aeon and many more Cryptonight coins. +XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. ## HTML reports @@ -28,7 +28,7 @@ XMR-Stak is a universal Stratum pool miner. 
This miner supports CPUs, AMD and NV - supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON) - easy to use - guided start (no need to edit a config file for the first start) - - auto configuration for each backend + - auto-configuration for each backend - open source software (GPLv3) - TLS support - [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton) @@ -64,7 +64,7 @@ If your prefered coin is not listed, you can choose one of the following algorit - cryptonight_haven - cryptonight_heavy -Please note, this list is not complete, and is not an endorsement. +Please note, this list is not complete and is not an endorsement. ## Download @@ -72,7 +72,7 @@ You can find the latest releases and precompiled binaries on GitHub under [Relea ## Default Developer Donation -By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries. +By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries. 
If you want to donate directly to support further development, here is my wallet From 63384f4cf6915bfe98047686b622e93c8d0e6b58 Mon Sep 17 00:00:00 2001 From: JokerGermany <30293477+JokerGermany@users.noreply.github.com> Date: Fri, 3 Aug 2018 20:06:41 +0200 Subject: [PATCH 07/77] Download Link for AMD APP SDK 3.0 fixed --- doc/compile_Windows.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 802d5c5ab..add5fbfd0 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -32,8 +32,8 @@ ### AMD APP SDK 3.0 (only needed for AMD GPUs) -- Download and install the latest version from https://www.dropbox.com/s/gq8vqhelq0m6gj4/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe - (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) +- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe + (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) ### Dependencies OpenSSL/Hwloc and Microhttpd - For CUDA 8*: From 43fa697cfa8b63d1fbc63c363b37252141fd84b6 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Wed, 8 Aug 2018 06:45:52 -0600 Subject: [PATCH 08/77] Add detail to CUDA detections, better for issue reports --- .../backend/nvidia/nvcc_code/cuda_extra.cu | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 3b049ace8..b455f55ca 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -450,19 +450,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) if(version < CUDART_VERSION) { - printf("Driver does not 
support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + printf("WARNING: Driver supports CUDA %d.%d but this was compiled for CUDA %d.%d API! Update your nVidia driver or compile with older CUDA!\n", + version / 1000, (version % 1000 / 10), + CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); return 1; } int GPU_N; if(cuda_get_devicecount(&GPU_N) == 0) { + printf("WARNING: CUDA claims zero devices?\n"); return 1; } if(ctx->device_id >= GPU_N) { - printf("Invalid device ID!\n"); + printf("WARNING: Invalid device ID '%i'!\n", ctx->device_id); return 1; } @@ -483,6 +486,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->name = std::string(props.name); + printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ", + version / 1000, (version % 1000 / 10), + CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10, + ctx->device_id, gpuArch, ctx->device_name); + std::vector arch; #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) @@ -496,13 +504,14 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) while ( ss >> tmpArch ) arch.push_back( tmpArch ); + #define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" if(gpuArch >= 20 && gpuArch < 30) { // compiled binary must support sm_20 for fermi std::vector::iterator it = std::find(arch.begin(), arch.end(), 20); if(it == arch.end()) { - printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch); + printf(MSG_CUDA_NO_ARCH); return 5; } } @@ -520,7 +529,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) minSupportedArch = arch[i]; if(minSupportedArch < 30 || gpuArch < minSupportedArch) { - printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch); + printf(MSG_CUDA_NO_ARCH); return 5; } } @@ -529,8 +538,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) 
if(ctx->device_blocks == -1) { /* good values based of my experience - * - 3 * SMX count >=sm_30 - * - 2 * SMX count for =sm_30 + * - 2 * SMX count for device_blocks = props.multiProcessorCount * ( props.major < 3 ? 2 : 3 ); @@ -582,18 +591,19 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) int* tmp; cudaError_t err; + #define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" // a device must be selected to get the right memory usage later on err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) { - printf("WARNING: NVIDIA GPU %d: cannot be selected.\n", ctx->device_id); + printf(MSG_CUDA_FUNC_FAIL, "cudaSetDevice"); return 2; } // trigger that a context on the gpu will be allocated err = cudaMalloc(&tmp, 256); if(err != cudaSuccess) { - printf("WARNING: NVIDIA GPU %d: context cannot be created.\n", ctx->device_id); + printf(MSG_CUDA_FUNC_FAIL, "cudaMalloc"); return 3; } @@ -626,9 +636,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t usedMem = totalMemory - freeMemory; if(usedMem >= maxMemUsage) { - printf("WARNING: NVIDIA GPU %d: already %s MiB memory in use, skip GPU.\n", - ctx->device_id, - std::to_string(usedMem/byteToMiB).c_str()); + printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str()); return 4; } else @@ -661,6 +669,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } } + printf("device init succeeded\n"); return 0; } From 1eb199a53c3ef796da56ea259251c0b91908764d Mon Sep 17 00:00:00 2001 From: jefferson-1 Date: Sat, 18 Aug 2018 21:55:06 -0500 Subject: [PATCH 09/77] Incorrect Grammar fix Improve the grammar of the Dev donation setting. --- xmrstak/donate-level.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp index 71b79628a..0f851f8f8 100644 --- a/xmrstak/donate-level.hpp +++ b/xmrstak/donate-level.hpp @@ -1,18 +1,19 @@ #pragma once /* - * Dev donation. 
- * Percentage of your hashing power that you want to donate to the developer, can be 0.0 if you don't want to do that. - * Example of how it works for the default setting of 2.0: - * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minute. - * Switching is instant, and only happens after a successful connection, so you never loose any hashes. + * DEV DONATION SETTING + * This setting is a percentage of your hashing power that the miner donates to the developers of this app. + * It can be 0.0 if you don't want to help the developers. The default setting of 2.0 means that + * the miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minutes. + * Switching pools is instant and it only happens after a successful connection, so you don't lose any hash time. * - * If you plan on changing this setting to 0.0 please consider making a one off donation to our wallets: + * If you plan on changing this setting to 0.0, please consider making a one time donation to our wallets: * fireice-uk: * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk * psychocrypt: * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU * + * Thank you for your support. */ constexpr double fDevDonationLevel = 2.0 / 100.0; From 69628078c50727e374f425d846a7e0b7997d0405 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 28 Aug 2018 20:29:57 +0200 Subject: [PATCH 10/77] add self test hashes The most algorithm currently are not checked in the cpu self test function. 
- add hash for each algorithm --- xmrstak/backend/cpu/minethd.cpp | 226 +++++++++++++++++++++----------- 1 file changed, 149 insertions(+), 77 deletions(-) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 2e7169ef7..a8452ebb1 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -232,92 +232,164 @@ bool minethd::self_test() bool bResult = true; - if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight) - { - unsigned char out[32 * MAX_N]; - cn_hash_fun hashf; - cn_hash_fun_multi hashf_multi; - - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); - bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); - bResult &= 
memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - - hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - - hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - - hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_aeon) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_ipbc) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_stellite) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_masari) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_bittube2) + unsigned char out[32 * MAX_N]; + cn_hash_fun hashf; + cn_hash_fun_multi hashf_multi; + + xmrstak_algo algo = xmrstak_algo::invalid_algo; + + for(int algo_idx = 0; algo_idx < 2; ++algo_idx) { - unsigned char out[32 * MAX_N]; - cn_hash_fun hashf; + if(algo_idx == 0) + algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(); + else + algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); + if(algo == cryptonight) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf("This is a test", 14, out, ctx[0]); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf("This is a test", 14, out, ctx[0]); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis 
is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + + hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + + hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; + } + else if(algo == cryptonight_lite) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; - hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]); - bResult = memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + } + else if(algo == cryptonight_monero) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; - 
hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]); - bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + } + else if(algo == cryptonight_aeon) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]); - bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, 
"\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + } + else if(algo == cryptonight_ipbc) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; + } + else if(algo == cryptonight_stellite) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + } + else if(algo == cryptonight_masari) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, 
"\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + } + else if(algo == cryptonight_heavy) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + } + else if(algo == cryptonight_haven) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); + hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + } + else if(algo == 
cryptonight_bittube2) + { + unsigned char out[32 * MAX_N]; + cn_hash_fun hashf; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); + + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]); + bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; + + hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]); + bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; + + hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]); + bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; + } + + if(!bResult) + printer::inst()->print_msg(L0, + "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); } + for (int i = 0; i < MAX_N; i++) cryptonight_free_ctx(ctx[i]); - if(!bResult) - printer::inst()->print_msg(L0, - "Cryptonight hash self-test failed. 
This might be caused by bad compiler optimizations."); - return bResult; } From e7c8382708779a0447ec3e0541512b515b5bea33 Mon Sep 17 00:00:00 2001 From: Piotr Chromiec Date: Thu, 13 Sep 2018 13:52:49 +0200 Subject: [PATCH 11/77] AMD APP SDK 3.0 url fix dropbox link is broken --- doc/compile_Linux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index 072402ff7..fce52b64c 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -4,7 +4,7 @@ ### AMD APP SDK 3.0 (only needed to use AMD GPUs) -- download and install the latest version from https://www.dropbox.com/sh/mpg882ekirnsfa7/AADWz5X-TgVdsmWt0QwMgTWLa/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2?dl=0 +- download and install the latest version from http://debian.nullivex.com/amd/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 (see https://github.com/fireice-uk/xmr-stak/issues/1511#issuecomment-385120692) (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/228059) ### Cuda 8.0+ (only needed to use NVIDIA GPUs) From 931bd5fef17f908afc62836ae7b6ea087d1441ca Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 10 Sep 2018 16:49:59 +0200 Subject: [PATCH 12/77] unify cpu cryptonight implementations xmr-stak has several implementations for multi hash per thread. This results in 3 independent implementations. Each time the algorithm must be changed the possibility to introduce errors is very large. - unify the different cryptonight CPU implementations - simplify the function selection array to find the specialized cryptonight implementation - add an intermediate pointer to access the large state (similar to the old multi hash implementation) As a side effect this change increases the speed of the single and multi hash algorithm. 
--- xmrstak/backend/amd/minethd.cpp | 2 +- xmrstak/backend/amd/minethd.hpp | 2 +- .../backend/cpu/crypto/cryptonight_aesni.h | 947 ++++++------------ xmrstak/backend/cpu/minethd.cpp | 478 ++------- xmrstak/backend/cpu/minethd.hpp | 7 +- xmrstak/backend/nvidia/minethd.cpp | 2 +- xmrstak/backend/nvidia/minethd.hpp | 2 +- 7 files changed, 373 insertions(+), 1067 deletions(-) diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index f7b47249e..d6051ffcd 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -252,7 +252,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 3142117c5..04c2ff8ad 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -24,7 +24,7 @@ class minethd : public iBackend static bool init_gpus(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 9f70bcfa7..89c508990 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -151,15 +151,15 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) { - __m128i tmp0 = x0; - x0 = _mm_xor_si128(x0, 
x1); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_xor_si128(x2, x3); - x3 = _mm_xor_si128(x3, x4); - x4 = _mm_xor_si128(x4, x5); - x5 = _mm_xor_si128(x5, x6); - x6 = _mm_xor_si128(x6, x7); - x7 = _mm_xor_si128(x7, tmp0); + __m128i tmp0 = x0; + x0 = _mm_xor_si128(x0, x1); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_xor_si128(x2, x3); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_xor_si128(x4, x5); + x5 = _mm_xor_si128(x5, x6); + x6 = _mm_xor_si128(x6, x7); + x7 = _mm_xor_si128(x7, tmp0); } template @@ -467,712 +467,325 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } -template -void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) -{ - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); - - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 32); - return; - } - - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); - - uint64_t monero_const; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const ^= *(reinterpret_cast(ctx0->hash_state) + 24); +#define CN_INIT_SINGLE \ + if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ + { \ + memset(output, 0, 32 * N); \ + return; \ } - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t ah0 = 
h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - - // Optim - 90% time boundary - for(size_t i = 0; i < ITERATIONS; i++) - { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - - idx0 = _mm_cvtsi128_si64(cx); - - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - bx0 = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - - lo = _umul128(idx0, cl, &hi); - - al0 += hi; - ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - al0 ^= cl; - if(PREFETCH) - _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0); - ah0 += lo; - - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const; - } - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; - ah0 ^= ch; - - idx0 = al0; +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ + uint64_t monero_const; \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == 
cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ + monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ + } \ + /* Optim - 99% time boundary */ \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \ + \ + __m128i ax0; \ + uint64_t idx0; \ + __m128i bx0; \ + uint8_t* l0 = ctx[n]->long_state; \ + { \ + uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ + idx0 = h0[0] ^ h0[4]; \ + ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ + bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + } \ + __m128i *ptr0 - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; - } - else if(ALGO == cryptonight_haven) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; - } +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ + __m128i cx; \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + cx = _mm_load_si128(ptr0); \ + if (ALGO == cryptonight_bittube2) \ + { \ + cx = aes_round_bittube2(cx, ax0); \ + } \ + else \ + { \ + if(SOFT_AES) \ + cx = soft_aesenc(cx, ax0); \ + else \ + cx = _mm_aesenc_si128(cx, ax0); \ } - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - - // Optim - 99% time boundary - - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); -} - -// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon -// to fit temporary vars for two contexts. 
Function will read len*2 from input and write 64 bytes to output -// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) -template -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) -{ - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); - - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 64); - return; - } +#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ + else \ + _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \ + idx0 = _mm_cvtsi128_si64(cx); \ + \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ + bx0 = cx; \ + +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + uint64_t lo, cl, ch; \ + uint64_t al0 = _mm_cvtsi128_si64(ax0); \ + uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ + cl = ((uint64_t*)ptr0)[0]; \ + ch = ((uint64_t*)ptr0)[1]; \ + \ + { \ + uint64_t hi; \ + lo = _umul128(idx0, cl, &hi); \ + ah0 += lo; \ + al0 += hi; \ + } \ + ((uint64_t*)ptr0)[0] = al0; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0) + - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); +#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == 
cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ + else \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ + } \ + else \ + ((uint64_t*)ptr0)[1] = ah0; \ + al0 ^= cl; \ + ah0 ^= ch; \ + ax0 = _mm_set_epi64x(ah0, al0); \ + idx0 = al0; - uint64_t monero_const_0, monero_const_1; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const_0 = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const_0 ^= *(reinterpret_cast(ctx[0]->hash_state) + 24); - monero_const_1 = *reinterpret_cast(reinterpret_cast(input) + len + 35); - monero_const_1 ^= *(reinterpret_cast(ctx[1]->hash_state) + 24); +#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = d ^ q; \ + } \ + else if(ALGO == cryptonight_haven) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = (~d) ^ q; \ } - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; +#define CN_FINALIZE(n) \ + /* Optim - 90% time boundary */ \ + cn_implode_scratchpad((__m128i*)ctx[n]->long_state, 
(__m128i*)ctx[n]->hash_state); \ + /* Optim - 99% time boundary */ \ + keccakf((uint64_t*)ctx[n]->hash_state, 24); \ + extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) - uint64_t axl0 = h0[0] ^ h0[4]; - uint64_t axh0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - uint64_t axl1 = h1[0] ^ h1[4]; - uint64_t axh1 = h1[1] ^ h1[5]; - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - uint64_t idx1 = h1[0] ^ h1[4]; +//! defer the evaluation of a macro +#ifndef _MSC_VER +# define CN_DEFER(...) __VA_ARGS__ +#else +# define CN_EMPTY(...) +# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() +#endif - // Optim - 90% time boundary - for (size_t i = 0; i < ITERATIONS; i++) +//! execute the macro f with the passed arguments +#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__) + +/** append n to all arguments and keep n as the first argument + * + * @param n number which is appended to the arguments (except the first argument n) + * + * @code{.cpp} + * CN_ENUM_2(1, foo, bar) + * // is transformed to + * 1, foo1, bar1 + * @endcode + */ +#define CN_ENUM_0(n, ...) 
n +#define CN_ENUM_1(n, x1) n, x1 ## n +#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n +#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n +#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n +#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n +#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n +#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n +#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n +#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n +#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n +#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n +#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n + +/** repeat a macro call multiple times + * + * @param n number of arguments following f + * @param f name of the macro which should be executed + * @param ... n parameters whose names get a unique number appended + * + * @code{.cpp} + * REPEAT_2(2, f, foo, bar) + * // is transformed to + * f(0, foo0, bar0); f(1, foo1, bar1) + * @endcode + */ +#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)) +#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)) +#define REPEAT_3(n, f, ...) 
CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)) +#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)) +#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__)) + +template< size_t N> +struct Cryptonight_hash; + +template< > +struct Cryptonight_hash<1> +{ + static constexpr size_t N = 1; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - - idx0 = _mm_cvtsi128_si64(cx); - bx0 = cx; - - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - - cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == 
cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - else - _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - - idx1 = _mm_cvtsi128_si64(cx); - bx1 = cx; - - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - - lo = _umul128(idx0, cl, &hi); - - axl0 += hi; - axh0 += lo; - ((uint64_t*)&l0[idx0 & MASK])[0] = axl0; - - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0; - } else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0; - - axh0 ^= ch; - axl0 ^= cl; - idx0 = axl0; + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); + CN_INIT_SINGLE; + REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; - } - else if(ALGO == cryptonight_haven) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; + REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, 
cx); + REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - - cl = ((uint64_t*)&l1[idx1 & MASK])[0]; - ch = ((uint64_t*)&l1[idx1 & MASK])[1]; - - lo = _umul128(idx1, cl, &hi); - - axl1 += hi; - axh1 += lo; - ((uint64_t*)&l1[idx1 & MASK])[0] = axl1; + REPEAT_1(0, CN_FINALIZE); + } +}; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0]; - else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1; - } else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1; +template< > +struct Cryptonight_hash<2> +{ + static constexpr size_t N = 2; - axh1 ^= ch; - axl1 ^= cl; - idx1 = axl1; + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); + CN_INIT_SINGLE; + REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = d ^ q; - } - else if(ALGO == cryptonight_haven) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = (~d) ^ q; + REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, 
idx0, ptr0, cx); + REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); + REPEAT_2(0, CN_FINALIZE); } +}; - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); - cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); - - // Optim - 99% time boundary - - keccakf((uint64_t*)ctx[0]->hash_state, 24); - extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx[1]->hash_state, 24); - extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); -} - -#define CN_STEP1(a, b, c, l, ptr, idx) \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - c = _mm_load_si128(ptr); - -#define CN_STEP2(a, b, c, l, ptr, idx) \ - if (ALGO == cryptonight_bittube2) \ - { \ - c = aes_round_bittube2(c, a); \ - } \ - else \ - { \ - if(SOFT_AES) \ - c = soft_aesenc(c, a); \ - else \ - c = _mm_aesenc_si128(c, a); \ - } \ - b = _mm_xor_si128(b, c); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - cryptonight_monero_tweak((uint64_t*)ptr, b); \ - else \ - _mm_store_si128(ptr, b);\ - -#define CN_STEP3(a, b, c, l, ptr, idx) \ - idx = _mm_cvtsi128_si64(c); \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - b = _mm_load_si128(ptr); - -#define CN_STEP4(a, b, c, l, mc, ptr, idx) \ - lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ - a = _mm_add_epi64(a, _mm_set_epi64x(lo, 
hi)); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \ - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ - ((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\ - } \ - else \ - _mm_store_si128(ptr, a);\ - a = _mm_xor_si128(a, b); \ - idx = _mm_cvtsi128_si64(a); \ - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = d ^ q; \ - } \ - else if(ALGO == cryptonight_haven) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = (~d) ^ q; \ - } - -#define CONST_INIT(ctx, n) \ - __m128i mc##n = _mm_set_epi64x(*reinterpret_cast(reinterpret_cast(input) + n * len + 35) ^ \ - *(reinterpret_cast((ctx)->hash_state) + 24), 0); - -// This lovelier creation will do 3 cn hashes at a time. 
-template -void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<3> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 3; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 3); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 3; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - 
idx2 = _mm_cvtsi128_si64(ax2); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - } + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 3; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_3(0, CN_FINALIZE); } -} +}; 
-// This even lovelier creation will do 4 cn hashes at a time. -template -void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<4> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 4; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 4); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 4; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ 
h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - } + // Optim - 90% 
time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 4; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_4(0, CN_FINALIZE); } -} +}; -// This most lovely creation will do 5 cn hashes at a time. -template -void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<5> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 5; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 5); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 5; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - 
CONST_INIT(ctx[4], 4); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - uint8_t* l4 = ctx[4]->long_state; - uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - __m128i cx4 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3, idx4; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - idx4 = _mm_cvtsi128_si64(ax4); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - 
CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4); - } + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 5; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - 
keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_5(0, CN_FINALIZE); } -} +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index a8452ebb1..93ce218a3 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -234,7 +234,7 @@ bool minethd::self_test() unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - cn_hash_fun_multi hashf_multi; + cn_hash_fun hashf_multi; xmrstak_algo algo = xmrstak_algo::invalid_algo; @@ -248,37 +248,37 @@ bool minethd::self_test() if(algo == cryptonight) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); + hashf("This is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); + hashf("This is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - 
hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" @@ -289,7 +289,7 @@ bool minethd::self_test() else if(algo == cryptonight_lite) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); @@ -298,71 +298,71 @@ bool minethd::self_test() else if(algo == cryptonight_monero) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } else if(algo == cryptonight_aeon) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } else if(algo == cryptonight_ipbc) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && 
memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } else if(algo == cryptonight_stellite) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } else if(algo == cryptonight_masari) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } else if(algo == cryptonight_heavy) { hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } else if(algo == cryptonight_haven) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } else if(algo == cryptonight_bittube2) @@ -372,13 +372,13 @@ bool minethd::self_test() hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); - 
hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]); + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx); bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; - hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]); + hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]); + 
hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } @@ -438,8 +438,10 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work return pvThreads; } -minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +template +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) { + static_assert(N >= 1, "number of threads must be >= 1" ); // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -483,46 +485,55 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr } static const cn_hash_fun func_table[] = { - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash + Cryptonight_hash::template hash, + Cryptonight_hash::template 
hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -532,333 +543,14 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr return func_table[ algv << 2 | digit.to_ulong() ]; } -void minethd::work_main() +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) { - if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); - - order_fix.set_value(); - std::unique_lock lck(thd_aff_set); - lck.release(); - std::this_thread::yield(); - - cryptonight_ctx* ctx; - uint64_t iCount = 0; - uint64_t* piHashVal; - uint32_t* piNonce; - job_result result; - - // start with root algorithm and 
switch later if fork version is reached - auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - ctx = minethd_alloc_ctx(); - - piHashVal = (uint64_t*)(result.bResult + 24); - piNonce = (uint32_t*)(oWork.bWorkBlob + 39); - result.iThreadId = iThreadNo; - - uint8_t version = 0; - size_t lastPoolId = 0; - - while (bQuit == 0) - { - if (oWork.bStall) - { - /* We are stalled here because the executor didn't find a job for us yet, - * either because of network latency, or a socket problem. Since we are - * raison d'etre of this software it us sensible to just wait until we have something - */ - - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - globalStates::inst().consume_work(oWork, iJobNo); - continue; - } - - size_t nonce_ctr = 0; - constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2 - - assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); - memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); - - if(oWork.bNiceHash) - result.iNonce = *piNonce; - - uint8_t new_version = oWork.getVersion(); - if(new_version != version || oWork.iPoolId != lastPoolId) - { - coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId); - if(new_version >= coinDesc.GetMiningForkVersion()) - { - miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - else - { - miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - result.algorithm = miner_algo; - lastPoolId = oWork.iPoolId; - version = new_version; - } - - while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) - { - if ((iCount++ & 
0xF) == 0) //Store stats every 16 hashes - { - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); - } - - if((nonce_ctr++ & (nonce_chunk-1)) == 0) - { - globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk); - // check if the job is still valid, there is a small posibility that the job is switched - if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) - break; - } - - *piNonce = result.iNonce; - - hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); - - if (*piHashVal < oWork.iTarget) - executor::inst()->push_event(ex_event(result, oWork.iPoolId)); - result.iNonce++; - - std::this_thread::yield(); - } - - globalStates::inst().consume_work(oWork, iJobNo); - } - - cryptonight_free_ctx(ctx); + return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo); } -minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +void minethd::work_main() { - // We have two independent flag bits in the functions - // therefore we will build a binary digit and select the - // function as a two digit binary - - uint8_t algv; - switch(algo) - { - case cryptonight: - algv = 2; - break; - case cryptonight_lite: - algv = 1; - break; - case cryptonight_monero: - algv = 0; - break; - case cryptonight_heavy: - algv = 3; - break; - case cryptonight_aeon: - algv = 4; - break; - case cryptonight_ipbc: - algv = 5; - break; - case cryptonight_stellite: - algv = 6; - break; - case cryptonight_masari: - algv = 7; - break; - case cryptonight_haven: - algv = 8; - break; - case cryptonight_bittube2: - algv = 9; - break; - default: - algv = 2; - break; - } - - static const cn_hash_fun_multi func_table[] = { - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - 
cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, 
- cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - 
cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash - }; - - std::bitset<2> digit; - digit.set(0, !bHaveAes); - digit.set(1, !bNoPrefetch); - - return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()]; + multiway_work_main<1u>(); } void minethd::double_work_main() @@ -926,7 +618,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun_multi hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); uint8_t version = 0; size_t lastPoolId = 0; @@ -961,12 +653,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 2d40ce314..26478542c 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -22,7 +22,7 @@ class minethd : public iBackend static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); static cn_hash_fun func_selector(bool 
bHaveAes, bool bNoPrefetch, xmrstak_algo algo); static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); @@ -30,8 +30,9 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); private: - typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); - static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + + template + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 88a1acc32..486a990e3 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -300,7 +300,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index d4ae03864..389356842 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -28,7 +28,7 @@ class minethd : public iBackend static bool self_test(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); void start_mining(); From c5ac310a7c65fdb0824c8c813f553bd90fddbb2b Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Sat, 15 Sep 2018 12:13:24 -0600 Subject: [PATCH 13/77] Update `doc/FAQ.md` with unified proper methods for Linux limits --- doc/FAQ.md | 27 
+++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/FAQ.md b/doc/FAQ.md index 50897659f..2d2820166 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -45,20 +45,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin ## Error: MEMORY ALLOC FAILED: mmap failed -On Linux you will need to configure large page support and increase your ulimit -l. +On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`). -To set large page support, add the following lines to `/etc/sysctl.conf` (`/etc/sysctl.d/xmr-stak.conf` for [Arch Linux](https://www.archlinux.org/news/deprecation-of-etcsysctlconf/) and its derivatives): +Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work). + +To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults): + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l + vm.nr_hugepages = 0 + 16 + +To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`: vm.nr_hugepages=128 -To increase the ulimit, add following lines to `/etc/security/limits.conf`: +You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). 
In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits) - * soft memlock 262144 - * hard memlock 262144 +To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`: + + * - memlock 262144 + root - memlock 262144 You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). +Recheck after completing these steps to validate: + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l + vm.nr_hugepages = 128 + 262144 -You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. +You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly). ## Illegal Instruction From 8a2f294d20b396aec08ab0b333ed25f3011c36fc Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 16 Sep 2018 20:24:50 +0200 Subject: [PATCH 14/77] fix that type of `memChunk` is not tested There is a copy-paste mistake: the type of the variable `memChunk` is not tested. 
--- xmrstak/backend/amd/jconf.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0f39ff2b9..9e15c930c 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -142,14 +142,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } - cfg.memChunk = (int)memChunk->GetInt64(); - - if(!idx->IsUint64() || cfg.memChunk > 18 ) + if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 ) { printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); return false; } + cfg.memChunk = (int)memChunk->GetInt64(); + if(!compMode->IsBool()) return false; From 2742ef094c6492b881b9fe0dc563e939e0d7d1d9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 17 Sep 2018 08:44:05 +0200 Subject: [PATCH 15/77] avoid OpenCL binary mismatch Avoid using an OpenCL binary from the cache if the driver or xmr-stak version has changed. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 87721ac8f..dedc32692 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -17,6 +17,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/version.hpp" #include #include @@ -375,6 +376,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } + std::vector openCLDriverVer(1024); + if(ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx ); + return ERR_OCL_API; + } + xmrstak_algo miner_algo[2] = { 
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() @@ -402,6 +410,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::string src_str(source_code); src_str += options; src_str += devNameVec.data(); + src_str += get_version_str(); + src_str += openCLDriverVer.data(); + std::string hash_hex_str; picosha2::hash256_hex_string(src_str, hash_hex_str); From 77160cf13a2beaf23c6fa2fad5180080b66583a0 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 19 Sep 2018 11:54:45 +0200 Subject: [PATCH 16/77] fix nicehash `invalid results` If the first bit of the nonce is `1` (which is very often the case when using a nicehash pool), then it could be that some OpenCL implementations handle the 64bit representation of the 32bit nonce on the device side as a signed integer. During a right bitshift we then pull spurious `1` bits from the upper part of the 64bit nonce representation into the 32bit part of the nonce. The result will be that the computed share is invalid. 
- explicit cast the nonce on the device to `uint` to avoid any side effects --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 002472d3a..78cd30c3a 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -482,9 +482,14 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, State[10] = input[10]; ((uint *)State)[9] &= 0x00FFFFFFU; - ((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24; + ((uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; ((uint *)State)[10] &= 0xFF000000U; - ((uint *)State)[10] |= ((get_global_id(0) >> 8)); + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); for(int i = 11; i < 25; ++i) State[i] = 0x00UL; @@ -605,7 +610,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 = as_uint2(input[4]); tweak1_2.s0 >>= 24; tweak1_2.s0 |= tweak1_2.s1 << 8; - tweak1_2.s1 = get_global_id(0); + tweak1_2.s1 = (uint)get_global_id(0); tweak1_2 ^= as_uint2(states[24]); #endif } @@ -918,7 +923,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } mem_fence(CLK_GLOBAL_MEM_FENCE); @@ -994,7 +999,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } @@ -1072,7 +1077,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } @@ -1133,7 +1138,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } From 16da98867769892392d0308d93e989748f3dab4c Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 19 Sep 2018 13:00:04 +0200 Subject: [PATCH 17/77] OpenCL: avoid out of memory access During the initialization of the compile parameter for 
OpenCL it could be that the fixed size buffer is too small. To avoid this we are now using `std::string`. There is no problem with using `std::string` because this part of the code is not performance critical. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 87721ac8f..273010800 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -388,11 +388,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ int threadMemMask = cn_select_mask(miner_algo[ii]); int hashIterations = cn_select_iter(miner_algo[ii]); - char options[512]; - snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", - hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, - int_port(hashMemSize), int(miner_algo[ii])); + std::string options; + options += " -DITERATIONS=" + std::to_string(hashIterations); + options += " -DMASK=" + std::to_string(threadMemMask); + options += " -DWORKSIZE=" + std::to_string(ctx->workSize); + options += " -DSTRIDED_INDEX=" + std::to_string(ctx->stridedIndex); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << ctx->memChunk); + options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 
1u : 0u); + options += " -DMEMORY=" + std::to_string(hashMemSize); + options += " -DALGO=" + std::to_string(miner_algo[ii]); + /* create a hash for the compile time cache * used data: * - source code @@ -418,7 +423,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL); + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); if(ret != CL_SUCCESS) { size_t len; From b751af9448f658e36aca884af8dd26a34f618195 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 10 Sep 2018 08:01:57 +0200 Subject: [PATCH 18/77] introduce `cryptonight_v8` and `monero8` - rmeove currency `monero7` - introduce `cryptonight_v8` and `monero8` --- xmrstak/backend/cpu/minethd.cpp | 17 ++++++++++++++++- xmrstak/backend/cryptonight.hpp | 13 +++++++++++++ xmrstak/jconf.cpp | 3 ++- xmrstak/misc/executor.cpp | 2 +- xmrstak/net/jpsock.cpp | 3 +++ 5 files changed, 35 insertions(+), 3 deletions(-) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 93ce218a3..e11c82009 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -305,6 +305,16 @@ bool minethd::self_test() hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } + else if(algo == cryptonight_monero_v8) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, 
xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + } else if(algo == cryptonight_aeon) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); @@ -533,7 +543,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index b6f656138..6b1afa928 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -16,6 +16,7 @@ enum xmrstak_algo cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + cryptonight_monero_v8 = 11 }; // define aeon settings @@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGH template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } @@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MEMORY; @@ -100,6 
+105,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } + template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } @@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MASK; @@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } @@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight: return CRYPTONIGHT_ITER; case cryptonight_ipbc: diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index b6580ea9a..609b55f72 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -99,12 +99,13 @@ xmrstak::coin_selection coins[] = { { "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr }, { "cryptonight_v7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, + { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr }, { "graft", {cryptonight_monero, cryptonight, 8u}, 
{cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "haven", {cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, + { "monero8", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 11d0f6df0..02ac8b7f5 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,7 +560,7 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; - + case cryptonight_monero_v8: case cryptonight_monero: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false); diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 9fce9b7e5..d20ba082f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes case cryptonight_monero: algo_name = "cryptonight_v7"; break; + case cryptonight_monero_v8: + algo_name = "cryptonight_v8"; + break; case cryptonight_aeon: algo_name = "cryptonight_lite_v7"; break; From 69f550cb72c4fe22aa6ea6ca8f477559e1899a14 Mon Sep 17 00:00:00 
2001 From: psychocrypt Date: Mon, 10 Sep 2018 08:05:59 +0200 Subject: [PATCH 19/77] CPU: cryptonight_v8 Add support for single hash cryptonight_v8. Co-authored-by: SChernykh --- .../backend/cpu/crypto/cryptonight_aesni.h | 135 ++++++++++++++---- xmrstak/backend/cpu/minethd.cpp | 3 + 2 files changed, 114 insertions(+), 24 deletions(-) diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 89c508990..273476096 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -19,6 +19,7 @@ #include "xmrstak/backend/cryptonight.hpp" #include #include +#include #ifdef __GNUC__ #include @@ -422,6 +423,27 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + +#if defined _MSC_VER || (__GNUC__ >= 7) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif + return _mm_cvtsi64_si128(r); +} + inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) { alignas(16) uint32_t k[4]; @@ -467,6 +489,51 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +inline void set_float_rounding_mode() +{ +#ifdef _MSC_VER + _control87(RC_DOWN, MCW_RC); +#else + std::fesetround(FE_DOWNWARD); +#endif +} + +#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ + /* Shuffle 
the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + +#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t sqrt_result = static_cast(_mm_cvtsi128_si64(sqrt_result_xmm)); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \ + const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ + /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ + sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO 
== cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -474,7 +541,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -489,16 +556,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) uint64_t idx0; \ __m128i bx0; \ uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + __m128i sqrt_result_xmm; \ + /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \ + set_float_rounding_mode(); \ + } \ } \ __m128i *ptr0 - -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ @@ -512,7 +590,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) cx = soft_aesenc(cx, ax0); \ else \ cx = _mm_aesenc_si128(cx, ax0); \ - } + } \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == 
cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -524,15 +603,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - bx0 = cx; \ + if(ALGO != cryptonight_monero_v8) \ + bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - \ + CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ @@ -542,7 +628,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -622,6 +707,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, 
x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n /** repeat a macro call multiple times * @@ -657,15 +745,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - - REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -687,14 +774,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, 
sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -716,14 +803,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -745,14 +832,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(7, CN_STEP1, 
monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +861,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index e11c82009..87f4d3285 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -489,6 +489,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_bittube2: algv = 9; break; + case cryptonight_monero_v8: + algv = 10; + break; default: algv = 2; break; From 5608f8df39504e69c2c1aaaa8ff5e60a83b06ee4 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: 
Mon, 10 Sep 2018 08:30:36 +0200 Subject: [PATCH 20/77] OpenCl: cryptonight_v8 - implement cryptonight_v8 - update auto adjust to fit the special requirements of `cryptonight_v8` - add fast math integer implementation for `sqrt`, `reciprocal` and `division` Co-authored-by: SChernykh --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 27 +++- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 138 ++++++++++++++---- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 136 +++++++++++++++++ xmrstak/backend/amd/autoAdjust.hpp | 20 ++- 4 files changed, 290 insertions(+), 31 deletions(-) create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 8d9b66853..bb39c5764 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -901,6 +901,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) //char* source_code = LoadTextFile(sSourcePath); + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -921,6 +924,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ; std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -930,16 +934,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + 
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + for(int i = 0; i < num_gpus; ++i) { + const std::string backendName = xmrstak::params::inst().openCLVendor; if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - const std::string backendName = xmrstak::params::inst().openCLVendor; printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } + if(useCryptonight_v8) + { + if(ctx[i].stridedIndex == 1) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 78cd30c3a..778c8d5ba 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif +//#include 
"opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -556,6 +558,8 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_GLOBAL_MEM_FENCE); } + +#define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads @@ -565,9 +569,24 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ) { - ulong a[2], b[2]; + ulong a[2]; + +// cryptonight_monero_v8 +#if(ALGO==11) + ulong b[4]; + uint4 b_x[2]; +#else + ulong b[2]; + uint4 b_x[1]; +#endif __local uint AES0[256], AES1[256], AES2[256], AES3[256]; +// cryptonight_monero_v8 +#if(ALGO==11) + __local uint RCP[256]; + uint2 division_result; + uint sqrt_result; +#endif const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) @@ -577,6 +596,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES1[i] = rotate(tmp, 8U); AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); +// cryptonight_monero_v8 +#if(ALGO==11) + RCP[i] = RCP_C[i]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -584,7 +607,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint2 tweak1_2; #endif - uint4 b_x; + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -604,7 +627,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] = states[1] ^ states[5]; b[1] = states[3] ^ states[7]; - b_x = ((uint4 *)b)[0]; + b_x[0] = ((uint4 *)b)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + b_x[1] = ((uint4 *)b)[1]; + 
division_result = as_uint2(states[12]); + sqrt_result = as_uint2(states[13]).s0; +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) tweak1_2 = as_uint2(input[4]); @@ -622,37 +655,81 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0]; + ulong idx0 = a[0] & MASK; #pragma unroll 8 for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif - b_x ^= ((uint4 *)c)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint table = 0x75310U; + b_x[0] ^= ((uint4 *)c)[0]; // cryptonight_stellite # if(ALGO == 7) - uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); # else - uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); # endif - b_x.s2 ^= ((table >> index) & 0x30U) << 24; + b_x[0].s2 ^= ((table >> index) 
& 0x30U) << 24; + SCRATCHPAD_CHUNK(0) = b_x[0]; +// cryptonight_monero_v8 +#elif(ALGO==11) + SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +#else + b_x[0] ^= ((uint4 *)c)[0]; + SCRATCHPAD_CHUNK(0) = b_x[0]; #endif - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; - + idx0 = c[0] & MASK; uint4 tmp; - tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; - + tmp = SCRATCHPAD_CHUNK(0); +// cryptonight_monero_v8 +#if(ALGO==11) + // Use division and square root results from the _previous_ iteration to hide the latency + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + division_result = fast_div_v2(RCP, c[1], d); + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); +#endif +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); @@ -663,39 +740,42 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; 
((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= tweak1_2; # endif #else - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - - b_x = ((uint4 *)c)[0]; + idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = d ^ q; -#endif + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = (d ^ q) & MASK; // cryptonight_haven -#if (ALGO == 9) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; +#elif (ALGO == 9) + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = (~d) ^ q; + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = ((~d) ^ q) & MASK; +#endif + +// cryptonight_monero_v8 +#if (ALGO == 11) + b_x[1] = b_x[0]; #endif + b_x[0] = ((uint4 *)c)[0]; } } mem_fence(CLK_GLOBAL_MEM_FENCE); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl new file mode 100644 index 000000000..fe7cea1ee --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -0,0 +1,136 @@ +R"===( +/* + * @author SChernykh + */ +static const __constant uint 
RCP_C[256] = +{ + 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, + 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, + 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, + 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, + 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, + 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, + 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, + 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, + 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, + 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, + 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, + 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, + 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, + 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, + 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, + 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, + 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, + 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, + 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, + 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, + 
0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, + 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, + 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, + 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, + 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, + 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, + 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, + 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, + 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, + 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, + 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, + 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, +}; + +inline uint get_reciprocal(const __local uchar *RCP, uint a) +{ + const uint index1 = (a & 0x7F000000U) >> 21; + const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; + + const uint r1 = *(const __local uint*)(RCP + index1); + + uint r2_0 = *(const __local uint*)(RCP + index1 + 4); + if (index2 > 0) r2_0 >>= 16; + const int r2 = r2_0 & 0xFFFFU; + + const uint r = r1 - (uint)(mul24(r2, index2) >> 6); + + const ulong lo0 = (ulong)(r) * a; + ulong lo = lo0 + ((ulong)(a) << 32); + + a >>= 1; + const bool b = (a >= lo) || (lo >= lo0); + lo = a - lo; + + const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo; + return as_uint2(k).s1 + (b ? 
r : 0); +} + +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + + ulong q; + ((uint*)&q)[0] = as_uint2(k).s1;; + ((uint*)&q)[1] = (k < a) ? 1 : 0; + + const long tmp = a - q * b; + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + return (uint2)( + as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), + as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) + ); +} + +inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r) +{ + const uint rcp = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a; + + ((uint*)q)[0] = as_uint2(k).s1; + ((uint*)q)[1] = (k < a) ? 1 : 0; + + long tmp = a - (*q) * b; + + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + if (overshoot) + { + --(*q); + tmp += b; + } + + if (undershoot) + { + ++(*q); + tmp -= b; + } + + *r = tmp; +} + +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + + float x1 = native_rsqrt(x); + x = native_sqrt(x); + + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (((long)(x0) * x0) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + b) > 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + + return result; +} +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index d6acec971..4a2ffdb19 100644 --- 
a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -127,6 +127,24 @@ class autoAdjust minFreeMem = 512u * byteToMiB; } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // set strided index to default + ctx.stridedIndex = 1; + + // nvidia performance is very bad if the scratchpad is not contiguous + if(ctx.isNVIDIA) + ctx.stridedIndex = 0; + + // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` + if(useCryptonight_v8) + ctx.stridedIndex = 2; + // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; @@ -153,7 +171,7 @@ class autoAdjust // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? 
"0" : "1") + ", \"mem_chunk\" : 2,\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" " \"comp_mode\" : true\n" + " },\n"; } From d035dbc160de3df3a800e872c37453b1d277db2b Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 10 Sep 2018 08:35:00 +0200 Subject: [PATCH 21/77] NVIDIA: cryptonight_v8 implement `cryptonight_v8` --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 164 +++++++++++++++++- .../backend/nvidia/nvcc_code/cuda_extra.cu | 25 ++- 2 files changed, 184 insertions(+), 5 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 6c6475150..3e6279288 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #endif } +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, val, src2); + return tmp; +} + +__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0) +{ + uint64_t x = (n0 >> 12) + (1023ULL << 52); + const double xx = sqrt( *reinterpret_cast(&x) ); + uint64_t r = *reinterpret_cast(&xx); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + + if (x2 < n0) ++r; + + return r; +} + template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) @@ -227,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti const int start = partidx * batchsize; const int end = start + batchsize; uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY]; - uint32_t a, d[2], idx0; + uint32_t a, a1, d[2], idx0; uint32_t 
t1[2], t2[2], res; uint32_t tweak1_2[2]; @@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = *(d_ctx_b + threads * 4 + thread); } } - d[1] = (d_ctx_b + thread * 4)[sub]; + + uint32_t bx1, division_result, sqrt_result; + if(ALGO == cryptonight_monero_v8) + { + d[1] = (d_ctx_b + thread * 12)[sub]; + bx1 = (d_ctx_b + thread * 12 + 4)[sub]; + + // must be valid only for `sub < 2` + division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2]; + } + else + d[1] = (d_ctx_b + thread * 4)[sub]; #pragma unroll 2 for ( i = start; i < end; ++i ) @@ -296,6 +333,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); + if(ALGO == cryptonight_monero_v8) + { + a1 = a; + } d[x] = a ^ t_fn0( x_0 & 0xff ) ^ t_fn1( (x_1 >> 8) & 0xff ) ^ @@ -303,6 +344,33 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti t_fn3( ( x_3 >> 24 ) ); } + // Shuffle the other 3x16 byte chunks in the current 64-byte cache line + if(ALGO == cryptonight_monero_v8) + { + // Shuffle constants here were chosen carefully + // to maximize permutation cycle length + // and have no 2-byte elements stay in their places + const uint32_t chunk1 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) ); + const uint32_t chunk2 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) ); + const uint32_t chunk3 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) ); + + uint32_t src = sub & 2; + const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1); + const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1); + const uint64_t cc3 = bx1_64 + chunk3_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), 
((uint32_t*)&cc3)[sub & 1]); + + const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1); + const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1); + const uint64_t cc1 = bx0_64 + chunk1_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]); + + const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1); + const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1); + const uint64_t cc2 = ax0_64 + chunk2_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]); + + } //XOR_BLOCKS_DST(c, b, &long_state[j]); t1[0] = shuffle<4>(sPtr,sub, d[x], 0); @@ -331,10 +399,76 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti uint32_t yy[2]; *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); + + if(ALGO == cryptonight_monero_v8 ) + { + const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1); + + // Use division and square root results from the _previous_ iteration to hide the latency + const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); + + + const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); + const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32); + + if(sub < 2) + *((uint64_t*)yy) ^= cl_rhs; + + + const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL; + + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + // + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + + // Compiler will optimize it to a single div instruction + const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); + + + const uint64_t division_result_tmp = static_cast(cx1 / dd) + ((cx1 % dd) << 32); + + 
division_result = ((uint32_t*)&division_result_tmp)[sub % 2]; + + // Use division_result as an input for the square root to prevent parallel implementation in hardware + const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp); + sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2]; + } + uint32_t zz[2]; zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); - + // Shuffle the other 3x16 byte chunks in the current 64-byte cache line + if(ALGO == cryptonight_monero_v8) + { + // Shuffle constants here were chosen carefully + // to maximize permutation cycle length + // and have no 2-byte elements stay in their places + const uint32_t chunk1 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) ); + const uint32_t chunk2 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) ); + const uint32_t chunk3 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) ); + + uint32_t src = sub & 2; + const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1); + const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1); + const uint64_t cc3 = bx1_64 + chunk3_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]); + + + + const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1); + const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1); + const uint64_t cc1 = bx0_64 + chunk1_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]); + + const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1); + const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1); + const uint64_t cc2 = ax0_64 + chunk2_64; + storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]); + } + t1[1] = shuffle<4>(sPtr,sub, d[x], 1); #pragma unroll for ( k = 0; k < 2; k++ ) @@ -384,13 +518,31 @@ __global__ void cryptonight_core_gpu_phase2( int 
threads, int bfactor, int parti idx0 = (~d) ^ q; } + if(ALGO == cryptonight_monero_v8) + { + bx1 = d[(x + 1) % 2]; + } } } if ( bfactor > 0 ) { (d_ctx_a + thread * 4)[sub] = a; - (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_monero_v8) + { + (d_ctx_b + thread * 12)[sub] = d[1]; + (d_ctx_b + thread * 12 + 4)[sub] = bx1; + + if(sub < 2) + { + // must be valid only for `sub < 2` + (d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result; + (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result; + } + } + else + (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; @@ -534,6 +686,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } + else if(miner_algo == cryptonight_monero_v8) + { + cryptonight_core_gpu_hash(ctx, startNonce); + } else if(miner_algo == cryptonight_heavy) { cryptonight_core_gpu_hash(ctx, startNonce); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b455f55ca..1ea54ddba 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + if(ALGO == cryptonight_monero_v8) + { + memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + // bx1 + XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); + memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + // division_result + memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + // sqrt_result + memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else 
+ memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); @@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } + else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || + cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + { + // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) + ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + if(miner_algo == cryptonight_monero_v8) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, From 522ff6a67b222a3584964ac7a75e53da6187c279 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 14 Sep 2018 20:48:13 +0200 Subject: [PATCH 22/77] NVIDIA: optimize shuffle - use shared memory to exchange --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 141 +++++++++++------- 1 file changed, 83 insertions(+), 58 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 3e6279288..1273f89e9 100644 --- 
a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -252,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti const int start = partidx * batchsize; const int end = start + batchsize; uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY]; - uint32_t a, a1, d[2], idx0; + uint32_t a, d[2], idx0; uint32_t t1[2], t2[2], res; uint32_t tweak1_2[2]; @@ -296,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti for ( int x = 0; x < 2; ++x ) { j = ( ( idx0 & MASK ) >> 2 ) + sub; - + if(ALGO == cryptonight_bittube2) { uint32_t k[4]; @@ -327,50 +327,69 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti } } } + else if(ALGO == cryptonight_monero_v8) + { + + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 chunk0{}; + chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0); + chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0); + chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0); + chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0); + + const uint32_t x_0 = ((uint32_t*)&chunk0)[sub]; + const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4]; + const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4]; + const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4]; + d[x] = a ^ + t_fn0( x_0 & 0xff ) ^ + t_fn1( (x_1 >> 8) & 0xff ) ^ + t_fn2( (x_2 >> 16) & 0xff ) ^ + t_fn3( ( x_3 >> 24 ) ); + + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + 
((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + + if(sub > 0) + { + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } + } else { const uint32_t x_0 = loadGlobal32( long_state + j ); const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); - if(ALGO == cryptonight_monero_v8) - { - a1 = a; - } d[x] = a ^ t_fn0( x_0 & 0xff ) ^ t_fn1( (x_1 >> 8) & 0xff ) ^ t_fn2( (x_2 >> 16) & 0xff ) ^ t_fn3( ( x_3 >> 24 ) ); } - - // Shuffle the other 3x16 byte chunks in the current 64-byte cache line - if(ALGO == cryptonight_monero_v8) - { - // Shuffle constants here were chosen carefully - // to maximize permutation cycle length - // and have no 2-byte elements stay in their places - const uint32_t chunk1 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) ); - const uint32_t chunk2 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) ); - const uint32_t chunk3 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) ); - - uint32_t src = sub & 2; - const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1); - const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1); - const uint64_t cc3 = bx1_64 + chunk3_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]); - - const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1); - const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 
1); - const uint64_t cc1 = bx0_64 + chunk1_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]); - - const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1); - const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1); - const uint64_t cc2 = ax0_64 + chunk2_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]); - - } //XOR_BLOCKS_DST(c, b, &long_state[j]); t1[0] = shuffle<4>(sPtr,sub, d[x], 0); @@ -443,30 +462,36 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti // Shuffle the other 3x16 byte chunks in the current 64-byte cache line if(ALGO == cryptonight_monero_v8) { - // Shuffle constants here were chosen carefully - // to maximize permutation cycle length - // and have no 2-byte elements stay in their places - const uint32_t chunk1 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10) ); - const uint32_t chunk2 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20) ); - const uint32_t chunk3 = loadGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30) ); - - uint32_t src = sub & 2; - const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1); - const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1); - const uint64_t cc3 = bx1_64 + chunk3_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]); - - - - const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1); - const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1); - const uint64_t cc1 = bx0_64 + chunk1_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]); - - const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1); - const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1); - const uint64_t cc2 = ax0_64 + chunk2_64; - storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), 
((uint32_t*)&cc2)[sub & 1]); + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + ((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + if(sub > 0) + { + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } } t1[1] = shuffle<4>(sPtr,sub, d[x], 1); From df1a4200ec781b1bd8a1d53e5a4fc8c8329672f9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 15 Sep 2018 22:43:21 +0200 Subject: [PATCH 23/77] OpenCL: optimize NVIDIA pass Create a special pass for NVIDIA GPUs to load memory chunks first into the shared memory. 
Co-authored-by: SChernykh --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 53 ++++++++++++++++--- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 778c8d5ba..9f474da87 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -418,6 +418,9 @@ void AESExpandKey256(uint *keybuf) } } +)===" +R"===( + #define MEM_CHUNK (1<> 4) ^ N)]) - +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) +#endif + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 @@ -575,6 +584,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO==11) ulong b[4]; uint4 b_x[2]; +// NVIDIA +# ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +# endif #else ulong b[2]; uint4 b_x[1]; @@ -661,6 +675,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + ulong idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 @@ -694,14 +713,24 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong 
*states # endif b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; // cryptonight_monero_v8 #elif(ALGO==11) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +# ifdef __NV_CL_C_VERSION + // flush shuffeled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; + idx0 = c[0] & MASK; + idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +# else + idx0 = c[0] & MASK; +# endif #else b_x[0] ^= ((uint4 *)c)[0]; SCRATCHPAD_CHUNK(0) = b_x[0]; -#endif idx0 = c[0] & MASK; +#endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); // cryptonight_monero_v8 @@ -753,6 +782,16 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; + +// cryptonight_monero_v8 +#if (ALGO == 11) +# if defined(__NV_CL_C_VERSION) + // flush shuffeled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; +# endif + b_x[1] = b_x[0]; +#endif + b_x[0] = ((uint4 *)c)[0]; idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 @@ -771,16 +810,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states idx0 = ((~d) ^ q) & MASK; #endif -// cryptonight_monero_v8 -#if (ALGO == 11) - b_x[1] = b_x[0]; -#endif - b_x[0] = ((uint4 *)c)[0]; } } mem_fence(CLK_GLOBAL_MEM_FENCE); } +)===" +R"===( + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { From 28f41a6e8a4f562272f86c8de9b582e530a4221f Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 16 Sep 2018 20:38:10 +0200 Subject: [PATCH 24/77] AMD: add unroll option add option `unroll` for OpenCL to allow better tuning the main POW kernel. 
--- xmrstak/backend/amd/amd_gpu/gpu.cpp | 1 + xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 2 +- xmrstak/backend/amd/autoAdjust.hpp | 2 +- xmrstak/backend/amd/config.tpl | 4 +++- xmrstak/backend/amd/jconf.cpp | 12 ++++++++++-- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 1 + 8 files changed, 19 insertions(+), 5 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index bb39c5764..767e53855 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -405,6 +405,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u); options += " -DMEMORY=" + std::to_string(hashMemSize); options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* create a hash for the compile time cache * used data: diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5ab80b82a..63c5029d7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -27,6 +27,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + int unroll = 0; bool isNVIDIA = false; int compMode; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 9f474da87..7d0ad1818 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -671,7 +671,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states { ulong idx0 = a[0] & MASK; - #pragma unroll 8 + #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 4a2ffdb19..c5b331c87 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ 
b/xmrstak/backend/amd/autoAdjust.hpp @@ -172,7 +172,7 @@ class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"comp_mode\" : true\n" + + " \"unroll\" : 8, \"comp_mode\" : true\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 28855f070..0101b7e2f 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -13,13 +13,15 @@ R"===( * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) + * unroll - controls how often the POW main loop is unrolled; valid range [0;127] * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. 
* "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 9e15c930c..cd2486973 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); memChunk = GetObjectMember(oThdConf, "mem_chunk"); + unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || - stridedIndex == nullptr || compMode == nullptr) + stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -149,6 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) } cfg.memChunk = (int)memChunk->GetInt64(); + + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 ) + { + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128"); + return false; + } + cfg.unroll = (int)unroll->GetInt64(); if(!compMode->IsBool()) return false; diff --git 
a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 580b69fe7..b852c5940 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -28,6 +28,7 @@ class jconf long long cpu_aff; int stridedIndex; int memChunk; + int unroll; bool compMode; }; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index d6051ffcd..5ac246335 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -99,6 +99,7 @@ bool minethd::init_gpus() vGpuData[i].stridedIndex = cfg.stridedIndex; vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; + vGpuData[i].unroll = cfg.unroll; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; From 2932de6951dc94a4cff7eec45b70dcda770b9bc9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 16 Sep 2018 22:23:36 +0200 Subject: [PATCH 25/77] assembler version for cryptonight_v8 Add @SChernykh assembler version for ryzen and intel processors. 
Co-authored-by: SChernykh --- .../cpu/crypto/asm/cryptonigh_v8_main_loop.S | 21 +++ .../crypto/asm/cryptonigh_v8_main_loop.asm | 18 ++ .../asm/cryptonigh_v8_main_loop_ivybridge.inc | 176 ++++++++++++++++++ .../asm/cryptonigh_v8_main_loop_ryzen.inc | 174 +++++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S new file mode 100644 index 000000000..cd747f7c5 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S @@ -0,0 +1,21 @@ +#define ALIGN .align +.intel_syntax noprefix +.section .text +.global cryptonigh_v8_mainloop_ivybridge_asm +.global cryptonigh_v8_mainloop_ryzen_asm + +ALIGN 64 +cryptonigh_v8_mainloop_ivybridge_asm: + sub rsp, 48 + mov rcx, rdi + #include "cryptonigh_v8_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +ALIGN 64 +cryptonigh_v8_mainloop_ryzen_asm: + sub rsp, 48 + mov rcx, rdi + #include "cryptonigh_v8_main_loop_ryzen.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm new file mode 100644 index 000000000..2101a59ce --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm @@ -0,0 +1,18 @@ +_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cryptonigh_v8_mainloop_ivybridge_asm +PUBLIC cryptonigh_v8_mainloop_ryzen_asm + +ALIGN 64 +cryptonigh_v8_mainloop_ivybridge_asm PROC + INCLUDE cryptonigh_v8_main_loop_ivybridge.inc + ret 0 +cryptonigh_v8_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cryptonigh_v8_mainloop_ryzen_asm PROC + INCLUDE 
cryptonigh_v8_main_loop_ryzen.inc + ret 0 +cryptonigh_v8_mainloop_ryzen_asm ENDP + +_TEXT_CNV8_MAINLOOP ENDS +END diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc new file mode 100644 index 000000000..ea7f799fd --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc @@ -0,0 +1,176 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + + ALIGN 64 +$main_loop_ivybridge: + movdqu xmm6, XMMWORD PTR [r10+rbx] + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movq rcx, xmm3 + movdqu XMMWORD PTR 
[rax+rbx], xmm2 + mov rax, rcx + movdqu XMMWORD PTR [r10+rbx], xmm1 + shl rax, 32 + xor rdi, rax + movq rbp, xmm6 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov r10, rbp + and r10d, 2097136 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r10+rbx] + lea r14, QWORD PTR [r10+rbx] + mov r12, QWORD PTR [r10+rbx+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test rdx, 524287 + je $sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] +$sqrt_fixup_ivybridge_ret: + + mov r9, r10 + mov rax, rdi + mul rbp + + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + add r8, rdx + add r11, rax + movdqu xmm0, XMMWORD PTR [r10+rbx] + movdqu xmm2, XMMWORD PTR [r9+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov QWORD PTR [r14], r8 + xor r8, rdi + mov r10, r8 + mov QWORD PTR [r14+8], r11 + and r10d, 2097136 + xor r11, r12 + dec rsi + jne $main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp $cnv2_main_loop_ivybridge_endp + +$sqrt_fixup_ivybridge: + dec rdx + mov r13, -4389456576512 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + mov r13, 4389456576511 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_ivybridge_ret + +$cnv2_main_loop_ivybridge_endp: 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc new file mode 100644 index 000000000..5797f5497 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc @@ -0,0 +1,174 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 64 +$main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa 
xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_ryzen + shr rdi, 19 + +$sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm0, XMMWORD PTR [r10+rbx] + movdqa xmm2, XMMWORD PTR [r9+rbx] + movdqa xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm4 + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_ryzen_endp + +$sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov rdx, 4389456576511 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + sub rcx, rdx + mov rdx, -4389456576512 + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_ryzen_ret + +$cnv2_main_loop_ryzen_endp: From 0a9a9abaf7afceeb923292630b93b0fe4830efea Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 16 Sep 2018 22:24:56 +0200 Subject: [PATCH 26/77] 
infrastructure to load asm code - add new option to `cpu.txt` named `asm` to select the asm code version - extend function selection method to choose assembler code for `cryptonight_v8` - update auto adjustment to add default value for option `asm` --- CMakeLists.txt | 11 +++++- xmrstak/backend/cpu/autoAdjust.hpp | 2 +- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +- xmrstak/backend/cpu/config.tpl | 13 ++++--- .../backend/cpu/crypto/cryptonight_aesni.h | 21 +++++++++++ xmrstak/backend/cpu/jconf.cpp | 7 +++- xmrstak/backend/cpu/jconf.hpp | 1 + xmrstak/backend/cpu/minethd.cpp | 35 +++++++++++++++---- xmrstak/backend/cpu/minethd.hpp | 5 +-- 9 files changed, 81 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b385d..067bbd0a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE) endif() target_link_libraries(xmr-stak-c ${LIBS}) +enable_language(ASM) +# asm optimized monero v8 code +add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" +) +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + + # compile generic backend files file(GLOB BACKEND_CPP "xmrstak/*.cpp" @@ -472,7 +481,7 @@ add_library(xmr-stak-backend STATIC ${BACKEND_CPP} ) -target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS}) +target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm) # compile CUDA backend if(CUDA_FOUND) diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 57dbef053..8588fea8c 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -82,7 +82,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? 
"true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 01d2280d8..a73de8618 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 2fc9a47ec..bfffc851e 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -7,10 +7,15 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make + * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. * - * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen] + * - auto: used the default implementation (no assembler version) + * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + * - ryzen: AMD Ryzen (1xxx and 2xxx series) + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. 
Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 * physical core CPU you should select cpu numbers 0-3. @@ -21,8 +26,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 273476096..0ab47e390 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -876,3 +876,24 @@ struct Cryptonight_hash<5> REPEAT_5(0, CN_FINALIZE); } }; + +extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); + +template +void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MEM = cn_select_memory(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + + if (asm_version == 1) + cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]); + else + cryptonigh_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); +} diff --git a/xmrstak/backend/cpu/jconf.cpp 
b/xmrstak/backend/cpu/jconf.cpp index 49da7ae2d..1f9501c40 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *no_prefetch, *aff; + const Value *mode, *no_prefetch, *aff, *asm_version; mode = GetObjectMember(oThdConf, "low_power_mode"); no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + asm_version = GetObjectMember(oThdConf, "asm"); if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; @@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) else cfg.iCpuAff = -1; + if(!asm_version->IsString()) + return false; + cfg.asm_version_str = asm_version->GetString(); + return true; } diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index be855036e..4ec9165d5 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -24,6 +24,7 @@ class jconf struct thd_cfg { int iMultiway; bool bNoPrefetch; + std::string asm_version_str; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 87f4d3285..f07c71481 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version) { this->backendType = iBackend::CPU; oWork = pWork; @@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, iJobNo = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; + asm_version_str = asm_version; std::unique_lock lck(thd_aff_set); std::future 
order_guard = order_fix.get_future(); @@ -441,7 +442,7 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work else printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str); pvThreads.push_back(thd); } @@ -449,9 +450,31 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work } template -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); + + // check for asm optimized version for cryptonight_v8 + if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + { + if(asm_version_str != "auto") + { + if(asm_version_str == "intel") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + return cryptonight_hash_v2_asm; + } + if(asm_version_str == "ryzen") + { + // AMD Ryzen (1xxx and 2xxx series) + return cryptonight_hash_v2_asm; + } + else + { + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + } + } + } // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -636,7 +659,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = 
func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); uint8_t version = 0; size_t lastPoolId = 0; @@ -671,12 +694,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 26478542c..53ff93c15 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -32,9 +32,9 @@ class minethd : public iBackend private: template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto"); - minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template void multiway_work_main(); @@ -60,6 +60,7 @@ class minethd : public iBackend bool bQuit; bool bNoPrefetch; + std::string asm_version_str = "auto"; }; } // namespace cpu From 0254553871ca33010d7cfe8cfe3d5b25b21cb013 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 16 Sep 2018 23:01:46 +0200 Subject: [PATCH 27/77] optimize single hash cryptonight_v8 If single hash is used the type of the variable to hold the intermediate sqrt value is 
changed from `__m128i` to `uint64_t` as suggested by @SChernykh --- .../backend/cpu/crypto/cryptonight_aesni.h | 88 ++++++++++++++----- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 0ab47e390..7c409d187 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef __GNUC__ #include @@ -423,7 +424,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } -inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) +inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) { __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); x = _mm_sqrt_sd(_mm_setzero_pd(), x); @@ -441,7 +442,7 @@ inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) // Fallback to simpler code if (x2 < n0) ++r; #endif - return _mm_cvtsi64_si128(r); + return r; } inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) @@ -489,6 +490,48 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +/** optimal type for sqrt + * + * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected. 
+ * + * @tparam N number of hashes per thread + */ +template +struct GetOptimalSqrtType +{ + using type = __m128i; +}; + +template<> +struct GetOptimalSqrtType<1u> +{ + using type = uint64_t; +}; +template +using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type; + +/** assign a value and convert if necessary + * + * @param output output type + * @param input value which is assigned to output + * @{ + */ +inline void assign(__m128i& output, const uint64_t input) +{ + output = _mm_cvtsi64_si128(input); +} + +inline void assign(uint64_t& output, const uint64_t input) +{ + output = input; +} + +inline void assign(uint64_t& output, const __m128i& input) +{ + output = _mm_cvtsi128_si64(input); +} +/** @} */ + inline void set_float_rounding_mode() { #ifdef _MSC_VER @@ -511,14 +554,15 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \ +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ if(ALGO == cryptonight_monero_v8) \ { \ - const uint64_t sqrt_result = static_cast(_mm_cvtsi128_si64(sqrt_result_xmm)); \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ /* Use division and square root results from the _previous_ iteration to hide the latency */ \ const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ - cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \ - const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ /* Most and least significant bits in the divisor are set to 1 \ * to make sure we don't divide by a small or even number, \ * so there are no shortcuts for such cases \ @@ -531,7 +575,7 @@ inline void set_float_rounding_mode() const uint64_t division_result = static_cast(cx_s / d) 
+ ((cx_s % d) << 32); \ division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ - sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } #define CN_INIT_SINGLE \ @@ -541,7 +585,7 @@ inline void set_float_rounding_mode() return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -559,7 +603,7 @@ inline void set_float_rounding_mode() /* BEGIN cryptonight_monero_v8 variables */ \ __m128i bx1; \ __m128i division_result_xmm; \ - __m128i sqrt_result_xmm; \ + GetOptimalSqrtType_t sqrt_result; \ /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ @@ -570,7 +614,7 @@ inline void set_float_rounding_mode() { \ bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ - sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \ + assign(sqrt_result, h0[13]); \ set_float_rounding_mode(); \ } \ } \ @@ -606,13 +650,13 @@ inline void set_float_rounding_mode() if(ALGO != cryptonight_monero_v8) \ bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = 
((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -745,14 +789,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +818,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, 
ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -803,14 +847,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -832,14 +876,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, 
lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -861,14 +905,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } From db70071534692198336e6a09d61323eb18f09bc8 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 17 Sep 2018 08:22:51 +0200 Subject: [PATCH 28/77] CPU: change default for `asm` Replace the asm option `auto` with `off` --- xmrstak/backend/cpu/autoAdjust.hpp | 2 +- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +- xmrstak/backend/cpu/config.tpl | 8 ++++---- xmrstak/backend/cpu/minethd.cpp | 2 +- xmrstak/backend/cpu/minethd.hpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 8588fea8c..28ff515d4 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -82,7 +82,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? 
"true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index a73de8618..2bebf82d0 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index bfffc851e..e4da15fad 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -10,8 +10,8 @@ R"===( * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. 
* - * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen] - * - auto: used the default implementation (no assembler version) + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [off, intel, ryzen] + * - off: used the default implementation (no assembler version) * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) * - ryzen: AMD Ryzen (1xxx and 2xxx series) * @@ -26,8 +26,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index f07c71481..2f01d5e90 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -457,7 +457,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc // check for asm optimized version for cryptonight_v8 if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) { - if(asm_version_str != "auto") + if(asm_version_str != "off") { if(asm_version_str == "intel") { diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 53ff93c15..eb77749f6 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -32,7 +32,7 @@ class minethd : public iBackend private: template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto"); + static cn_hash_fun func_multi_selector(bool bHaveAes, bool 
bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off"); minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); @@ -60,7 +60,7 @@ class minethd : public iBackend bool bQuit; bool bNoPrefetch; - std::string asm_version_str = "auto"; + std::string asm_version_str = "off"; }; } // namespace cpu From 354c208569500c4c65a9dd5ed6ac442fea75113e Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 17 Sep 2018 08:25:12 +0200 Subject: [PATCH 29/77] fix compiler incompatibilities - fix assembler code to pass the clang compiler - CMake: set asm file language - fix icc with gcc-7 compile issue with `_addcarry_u64` --- CMakeLists.txt | 1 + .../cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc | 4 ++-- .../backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc | 4 ++-- xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 4 +++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 067bbd0a2..cf439227f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -459,6 +459,7 @@ endif() target_link_libraries(xmr-stak-c ${LIBS}) enable_language(ASM) +set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C) # asm optimized monero v8 code add_library(xmr-stak-asm STATIC diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc index ea7f799fd..1cc20b35a 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc @@ -157,14 +157,14 @@ $sqrt_fixup_ivybridge_ret: $sqrt_fixup_ivybridge: dec rdx - mov r13, -4389456576512 + movq r13, -4389456576512 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax add rax, r13 - mov r13, 4389456576511 + movq r13, 4389456576511 sub rcx, r13 mov r13d, -2147483647 imul rcx, rax diff --git 
a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc index 5797f5497..c564d8949 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc @@ -157,14 +157,14 @@ $sqrt_fixup_ryzen_ret: $sqrt_fixup_ryzen: movq r9, xmm2 dec rdi - mov rdx, 4389456576511 + movq rdx, 4389456576511 mov rax, rdi shr rdi, 19 shr rax, 20 mov rcx, rdi sub rcx, rax sub rcx, rdx - mov rdx, -4389456576512 + movq rdx, -4389456576512 add rax, rdx imul rcx, rax sub rcx, r9 diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 7c409d187..0838cfac4 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -435,7 +435,9 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); -#if defined _MSC_VER || (__GNUC__ >= 7) +#ifdef __INTEL_COMPILER + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r); +#elif defined(_MSC_VER) || (__GNUC__ >= 7) _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); #else // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence From 13fbb8a541db75484af7a457b2c892e7e0b5cbca Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 17 Sep 2018 09:16:06 +0200 Subject: [PATCH 30/77] asm compiler compatibility - add special asm version for win64 and linux - add cmake path for MSVC and other systems --- CMakeLists.txt | 40 ++-- .../cpu/crypto/asm/cryptonigh_v8_main_loop.S | 22 ++- .../crypto/asm/cryptonigh_v8_main_loop.asm | 8 +- ...yptonigh_v8_main_loop_ivybridge_linux.inc} | 22 +-- ...ryptonigh_v8_main_loop_ivybridge_win64.inc | 176 ++++++++++++++++++ ...> 
cryptonigh_v8_main_loop_ryzen_linux.inc} | 22 +-- .../cryptonigh_v8_main_loop_ryzen_win64.inc | 174 +++++++++++++++++ 7 files changed, 415 insertions(+), 49 deletions(-) rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge.inc => cryptonigh_v8_main_loop_ivybridge_linux.inc} (91%) create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen.inc => cryptonigh_v8_main_loop_ryzen_linux.inc} (92%) create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc diff --git a/CMakeLists.txt b/CMakeLists.txt index cf439227f..b51eb2ae4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC) endif() endif() +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + # asm optimized monero v8 code + enable_language(ASM_MASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY ASM_MASM) + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" + ) +else() + # asm optimized monero v8 code + enable_language(ASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY C) + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" + ) +endif() + +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + # compile C files file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c") @@ -456,17 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99) if(MICROHTTPD_ENABLE) target_link_libraries(xmr-stak-c ${MHTD}) endif() -target_link_libraries(xmr-stak-c ${LIBS}) - -enable_language(ASM) -set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C) -# asm optimized monero v8 code -add_library(xmr-stak-asm - STATIC - "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" -) -set_property(TARGET xmr-stak-asm PROPERTY 
LINKER_LANGUAGE C) - +target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm) # compile generic backend files file(GLOB BACKEND_CPP @@ -509,7 +519,7 @@ if(CUDA_FOUND) ) endif() target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) - target_link_libraries(xmrstak_cuda_backend xmr-stak-backend) + target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() # compile AMD backend @@ -522,7 +532,7 @@ if(OpenCL_FOUND) ${OPENCLSRCFILES} ) target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} ) - target_link_libraries(xmrstak_opencl_backend xmr-stak-backend) + target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm) endif() # compile final binary @@ -538,7 +548,7 @@ endif() set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}") set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}") -target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend) +target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm) ################################################################################ # Install @@ -569,4 +579,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) else() # this rule is used if the install prefix is the build directory install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")") -endif() +endif() \ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S index cd747f7c5..736dac7de 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S @@ -1,21 +1,27 @@ #define ALIGN .align .intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn .section .text -.global cryptonigh_v8_mainloop_ivybridge_asm -.global cryptonigh_v8_mainloop_ryzen_asm 
+#endif +.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm) +.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm) -ALIGN 64 -cryptonigh_v8_mainloop_ivybridge_asm: +ALIGN 8 +FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cryptonigh_v8_main_loop_ivybridge.inc" + #include "cryptonigh_v8_main_loop_ivybridge_linux.inc" add rsp, 48 ret 0 -ALIGN 64 -cryptonigh_v8_mainloop_ryzen_asm: +ALIGN 8 +FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cryptonigh_v8_main_loop_ryzen.inc" + #include "cryptonigh_v8_main_loop_ryzen_linux.inc" add rsp, 48 ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm index 2101a59ce..7f2d6a584 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm @@ -2,15 +2,15 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cryptonigh_v8_mainloop_ivybridge_asm PUBLIC cryptonigh_v8_mainloop_ryzen_asm -ALIGN 64 +ALIGN 8 cryptonigh_v8_mainloop_ivybridge_asm PROC - INCLUDE cryptonigh_v8_main_loop_ivybridge.inc + INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc ret 0 cryptonigh_v8_mainloop_ivybridge_asm ENDP -ALIGN 64 +ALIGN 8 cryptonigh_v8_mainloop_ryzen_asm PROC - INCLUDE cryptonigh_v8_main_loop_ryzen.inc + INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc ret 0 cryptonigh_v8_mainloop_ryzen_asm ENDP diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc similarity index 91% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc index 1cc20b35a..23f6cc060 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc +++ 
b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc @@ -49,8 +49,8 @@ movq xmm0, rcx punpcklqdq xmm5, xmm0 - ALIGN 64 -$main_loop_ivybridge: + ALIGN 8 +main_loop_ivybridge: movdqu xmm6, XMMWORD PTR [r10+rbx] lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d @@ -105,10 +105,10 @@ $main_loop_ivybridge: sqrtsd xmm3, xmm0 movq rdx, xmm3 test rdx, 524287 - je $sqrt_fixup_ivybridge + je sqrt_fixup_ivybridge psrlq xmm3, 19 psubq xmm3, XMMWORD PTR [rsp+16] -$sqrt_fixup_ivybridge_ret: +sqrt_fixup_ivybridge_ret: mov r9, r10 mov rax, rdi @@ -138,7 +138,7 @@ $sqrt_fixup_ivybridge_ret: and r10d, 2097136 xor r11, r12 dec rsi - jne $main_loop_ivybridge + jne main_loop_ivybridge ldmxcsr DWORD PTR [rsp] mov rbx, QWORD PTR [rsp+160] @@ -153,24 +153,24 @@ $sqrt_fixup_ivybridge_ret: pop rdi pop rsi pop rbp - jmp $cnv2_main_loop_ivybridge_endp + jmp cnv2_main_loop_ivybridge_endp -$sqrt_fixup_ivybridge: +sqrt_fixup_ivybridge: dec rdx - movq r13, -4389456576512 + movq r13, -4389456576512 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax add rax, r13 - movq r13, 4389456576511 + movq r13, 4389456576511 sub rcx, r13 mov r13d, -2147483647 imul rcx, rax sub rcx, r9 adc rdx, 0 movq xmm3, rdx - jmp $sqrt_fixup_ivybridge_ret + jmp sqrt_fixup_ivybridge_ret -$cnv2_main_loop_ivybridge_endp: +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc new file mode 100644 index 000000000..ee7f31716 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc @@ -0,0 +1,176 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + 
mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + + ALIGN 8 +main_loop_ivybridge: + movdqu xmm6, XMMWORD PTR [r10+rbx] + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movd rcx, xmm3 + movdqu XMMWORD PTR [rax+rbx], xmm2 + mov rax, rcx + movdqu XMMWORD PTR [r10+rbx], xmm1 + shl rax, 32 + xor rdi, rax + movd rbp, xmm6 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov r10, rbp + and r10d, 2097136 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r10+rbx] + lea r14, QWORD PTR [r10+rbx] + mov r12, QWORD PTR [r10+rbx+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movd rdx, xmm3 + test rdx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] 
+sqrt_fixup_ivybridge_ret: + + mov r9, r10 + mov rax, rdi + mul rbp + + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + add r8, rdx + add r11, rax + movdqu xmm0, XMMWORD PTR [r10+rbx] + movdqu xmm2, XMMWORD PTR [r9+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov QWORD PTR [r14], r8 + xor r8, rdi + mov r10, r8 + mov QWORD PTR [r14+8], r11 + and r10d, 2097136 + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13, -4389456576512 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + mov r13, 4389456576511 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc similarity index 92% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc index c564d8949..551ee8573 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc @@ -45,8 +45,8 @@ movq xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 64 -$main_loop_ryzen: + ALIGN 8 +main_loop_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm0, r11 movq xmm6, r8 @@ -103,10 +103,10 @@ $main_loop_ryzen: sqrtsd xmm1, xmm0 
movq rdi, xmm1 test rdi, 524287 - je $sqrt_fixup_ryzen + je sqrt_fixup_ryzen shr rdi, 19 -$sqrt_fixup_ryzen_ret: +sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 @@ -136,7 +136,7 @@ $sqrt_fixup_ryzen_ret: and r10d, 2097136 movdqa xmm3, xmm5 dec ebp - jne $main_loop_ryzen + jne main_loop_ryzen ldmxcsr DWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp+48] @@ -152,23 +152,23 @@ $sqrt_fixup_ryzen_ret: pop r13 pop r12 pop rdi - jmp $cnv2_main_loop_ryzen_endp + jmp cnv2_main_loop_ryzen_endp -$sqrt_fixup_ryzen: +sqrt_fixup_ryzen: movq r9, xmm2 dec rdi - movq rdx, 4389456576511 + movq rdx, 4389456576511 mov rax, rdi shr rdi, 19 shr rax, 20 mov rcx, rdi sub rcx, rax sub rcx, rdx - movq rdx, -4389456576512 + movq rdx, -4389456576512 add rax, rdx imul rcx, rax sub rcx, r9 adc rdi, 0 - jmp $sqrt_fixup_ryzen_ret + jmp sqrt_fixup_ryzen_ret -$cnv2_main_loop_ryzen_endp: +cnv2_main_loop_ryzen_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc new file mode 100644 index 000000000..f70dccef8 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc @@ -0,0 +1,174 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + 
movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 8 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm0, r11 + movd xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movd r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movd rax, xmm0 + + div r9 + movd xmm0, rax + movd xmm1, rdx + punpckldq xmm0, xmm1 + movd r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm0, XMMWORD PTR [r10+rbx] + movdqa xmm2, XMMWORD PTR [r9+rbx] + movdqa xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm4 + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor 
r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movd r9, xmm2 + dec rdi + mov rdx, 4389456576511 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + sub rcx, rdx + mov rdx, -4389456576512 + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: From ce84244ac99e0d4714150d52a6ec12a5a5f80621 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 18 Sep 2018 20:53:20 +0200 Subject: [PATCH 31/77] fix segfault if option `asm` is not in `cpu.txt` --- xmrstak/backend/cpu/jconf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 1f9501c40..a14be1732 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -114,7 +114,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) aff = GetObjectMember(oThdConf, "affine_to_cpu"); asm_version = GetObjectMember(oThdConf, "asm"); - if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr) return false; if(!mode->IsBool() && !mode->IsNumber()) From 9a2ef075264942829e580af613b8ac0d3f8831d2 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 18 Sep 2018 20:57:18 +0200 Subject: [PATCH 32/77] update docs and reintroduce monero7 - reintroduce monero7 until the POW is final - update docs (add cryptonigh_v8) --- README.md | 1 + xmrstak/backend/amd/config.tpl | 1 + xmrstak/jconf.cpp | 1 + xmrstak/pools.tpl | 4 +++- 4 files changed, 6 insertions(+), 1 
deletion(-) diff --git a/README.md b/README.md index e3b01328a..2fe1bc511 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ If your prefered coin is not listed, you can choose one of the following algorit - cryptonight_masari - cryptonight_v7 - cryptonight_v7_stellite + - cryptonight_v8 - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 0101b7e2f..63106bcb9 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -9,6 +9,7 @@ R"===( * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * (not allowed for cryptonight_v8 ans monero8) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 609b55f72..c69d47ab8 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -105,6 +105,7 @@ xmrstak::coin_selection coins[] = { { "haven", {cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, + { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, { "monero8", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, 
{cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 78f2315ac..9c3dd5a59 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -27,7 +27,8 @@ POOLCONF], * haven (automatic switch with block version 3 to cryptonight_haven) * intense * masari - * monero7 (use this for Monero's new PoW) + * monero7 + * monero8 (use this to support Monero's Oct 2018 fork) * qrl - Quantum Resistant Ledger * ryo * turtlecoin @@ -41,6 +42,7 @@ POOLCONF], * # 2MiB scratchpad memory * cryptonight * cryptonight_v7 + * cryptonight_v8 * # 4MiB scratchpad memory * cyrptonight_bittube2 * cryptonight_haven From 78bd54ff2f63a65f5c01848160f08324d04ea2d3 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 18 Sep 2018 21:09:05 +0200 Subject: [PATCH 33/77] fix naming `cryptonigh` to `cryptonight` --- CMakeLists.txt | 8 +++--- .../cpu/crypto/asm/cryptonigh_v8_main_loop.S | 27 ------------------- .../crypto/asm/cryptonigh_v8_main_loop.asm | 18 ------------- .../cpu/crypto/asm/cryptonight_v8_main_loop.S | 27 +++++++++++++++++++ .../crypto/asm/cryptonight_v8_main_loop.asm | 18 +++++++++++++ ...ptonight_v8_main_loop_ivybridge_linux.inc} | 0 ...ptonight_v8_main_loop_ivybridge_win64.inc} | 0 ... cryptonight_v8_main_loop_ryzen_linux.inc} | 0 ... 
cryptonight_v8_main_loop_ryzen_win64.inc} | 0 .../backend/cpu/crypto/cryptonight_aesni.h | 8 +++--- 10 files changed, 53 insertions(+), 53 deletions(-) delete mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S delete mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge_linux.inc => cryptonight_v8_main_loop_ivybridge_linux.inc} (100%) rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge_win64.inc => cryptonight_v8_main_loop_ivybridge_win64.inc} (100%) rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen_linux.inc => cryptonight_v8_main_loop_ryzen_linux.inc} (100%) rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen_win64.inc => cryptonight_v8_main_loop_ryzen_win64.inc} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index b51eb2ae4..eec03df9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -448,18 +448,18 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "MSVC") # asm optimized monero v8 code enable_language(ASM_MASM) - set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY ASM_MASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM) add_library(xmr-stak-asm STATIC - "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" + "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" ) else() # asm optimized monero v8 code enable_language(ASM) - set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY C) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY C) add_library(xmr-stak-asm STATIC - "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" + 
"xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" ) endif() diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S deleted file mode 100644 index 736dac7de..000000000 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S +++ /dev/null @@ -1,27 +0,0 @@ -#define ALIGN .align -.intel_syntax noprefix -#ifdef __APPLE__ -# define FN_PREFIX(fn) _ ## fn -.text -#else -# define FN_PREFIX(fn) fn -.section .text -#endif -.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm) -.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm) - -ALIGN 8 -FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cryptonigh_v8_main_loop_ivybridge_linux.inc" - add rsp, 48 - ret 0 - -ALIGN 8 -FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm): - sub rsp, 48 - mov rcx, rdi - #include "cryptonigh_v8_main_loop_ryzen_linux.inc" - add rsp, 48 - ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm deleted file mode 100644 index 7f2d6a584..000000000 --- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm +++ /dev/null @@ -1,18 +0,0 @@ -_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE -PUBLIC cryptonigh_v8_mainloop_ivybridge_asm -PUBLIC cryptonigh_v8_mainloop_ryzen_asm - -ALIGN 8 -cryptonigh_v8_mainloop_ivybridge_asm PROC - INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc - ret 0 -cryptonigh_v8_mainloop_ivybridge_asm ENDP - -ALIGN 8 -cryptonigh_v8_mainloop_ryzen_asm PROC - INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc - ret 0 -cryptonigh_v8_mainloop_ryzen_asm ENDP - -_TEXT_CNV8_MAINLOOP ENDS -END diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S new file mode 100644 index 000000000..3aa8994dd --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S @@ -0,0 +1,27 @@ +#define ALIGN .align 
+.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm) +.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm) + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ivybridge_linux.inc" + add rsp, 48 + ret 0 + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ryzen_linux.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm new file mode 100644 index 000000000..3c2bba619 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -0,0 +1,18 @@ +_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cryptonight_v8_mainloop_ivybridge_asm +PUBLIC cryptonight_v8_mainloop_ryzen_asm + +ALIGN 8 +cryptonight_v8_mainloop_ivybridge_asm PROC + INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc + ret 0 +cryptonight_v8_mainloop_ivybridge_asm ENDP + +ALIGN 8 +cryptonight_v8_mainloop_ryzen_asm PROC + INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc + ret 0 +cryptonight_v8_mainloop_ryzen_asm ENDP + +_TEXT_CNV8_MAINLOOP ENDS +END diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc similarity index 100% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc similarity index 100% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc rename to 
xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc similarity index 100% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc similarity index 100% rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 0838cfac4..844e4c045 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -923,8 +923,8 @@ struct Cryptonight_hash<5> } }; -extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); -extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); template void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) @@ -935,9 +935,9 @@ void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, crypto cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); if (asm_version == 1) - cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]); + cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); else - cryptonigh_v8_mainloop_ryzen_asm(ctx[0]); + cryptonight_v8_mainloop_ryzen_asm(ctx[0]); cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); keccakf((uint64_t*)ctx[0]->hash_state, 
24); From 1692c543c6be416f5b6b14e1501c880e62ee5fe6 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 19 Sep 2018 18:05:47 +0200 Subject: [PATCH 34/77] asm, style and spelling fixes - fix code style issues - fix spelling issue - fix asm to support newer clang versions --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 4 ++-- xmrstak/backend/amd/config.tpl | 4 ++-- xmrstak/backend/amd/jconf.cpp | 4 ++-- xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S | 4 ++-- xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm | 4 ++-- .../crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc | 5 +++-- .../cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc | 6 +++--- xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 6 +++--- xmrstak/backend/cpu/minethd.cpp | 2 +- 9 files changed, 20 insertions(+), 19 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 7d0ad1818..286bc39b6 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -718,7 +718,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #elif(ALGO==11) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION - // flush shuffeled data + // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; idx0 = c[0] & MASK; idxS = idx0 & 0x30; @@ -786,7 +786,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) - // flush shuffeled data + // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; # endif b_x[1] = b_x[0]; diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 63106bcb9..043b05355 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -9,12 +9,12 @@ R"===( * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * 
required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks - * (not allowed for cryptonight_v8 ans monero8) + * (not allowed for cryptonight_v8 and monero8) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) - * unroll - allow to control how often the POW main loop is unrolled; valid range [0;128] + * unroll - allow to control how often the POW main loop is unrolled; valid range [0;128) - for most OpenCL implementations it must be a power of two. * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index cd2486973..777dbdbb5 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -151,9 +151,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) cfg.memChunk = (int)memChunk->GetInt64(); - if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 ) + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || ) { - printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128"); + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two"); return false; } cfg.unroll = (int)unroll->GetInt64(); diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S index 3aa8994dd..b6be9438f 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S 
@@ -14,7 +14,7 @@ ALIGN 8 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cryptonight_v8_main_loop_ivybridge_linux.inc" + #include "cryptonight_v8_main_loop_ivybridge_linux.inc" add rsp, 48 ret 0 @@ -22,6 +22,6 @@ ALIGN 8 FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cryptonight_v8_main_loop_ryzen_linux.inc" + #include "cryptonight_v8_main_loop_ryzen_linux.inc" add rsp, 48 ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm index 3c2bba619..a1615e9bd 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -4,13 +4,13 @@ PUBLIC cryptonight_v8_mainloop_ryzen_asm ALIGN 8 cryptonight_v8_mainloop_ivybridge_asm PROC - INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc + INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc ret 0 cryptonight_v8_mainloop_ivybridge_asm ENDP ALIGN 8 cryptonight_v8_mainloop_ryzen_asm PROC - INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc + INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc ret 0 cryptonight_v8_mainloop_ryzen_asm ENDP diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc index 23f6cc060..21f1f48c3 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -157,14 +157,15 @@ sqrt_fixup_ivybridge_ret: sqrt_fixup_ivybridge: dec rdx - movq r13, -4389456576512 + mov r13d, -1022 + shl r13, 32 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax add rax, r13 - movq r13, 4389456576511 + not r13 sub rcx, r13 mov r13d, -2147483647 imul rcx, rax diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc 
b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc index 551ee8573..9c177b85a 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -157,14 +157,14 @@ sqrt_fixup_ryzen_ret: sqrt_fixup_ryzen: movq r9, xmm2 dec rdi - movq rdx, 4389456576511 + mov edx, -1022 + shl rdx, 32 mov rax, rdi shr rdi, 19 shr rax, 20 mov rcx, rdi sub rcx, rax - sub rcx, rdx - movq rdx, -4389456576512 + lea rcx, [rcx+rdx+1] add rax, rdx imul rcx, rax sub rcx, r9 diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 844e4c045..6edae905e 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -441,8 +441,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); #else // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence - // Fallback to simpler code - if (x2 < n0) ++r; + // Fallback to simpler code + if (x2 < n0) ++r; #endif return r; } @@ -733,7 +733,7 @@ inline void set_float_rounding_mode() /** add append n to all arguments and keeps n as first argument * * @param n number which is appended to the arguments (expect the first argument n) - * + * * @code{.cpp} * CN_ENUM_2(1, foo, bar) * // is transformed to diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 2f01d5e90..05743ae92 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -453,7 +453,7 @@ template minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); - + // check for asm optimized version for cryptonight_v8 if(N 
== 1 && algo == cryptonight_monero_v8 && bHaveAes) { From ac56ecbde8d19c5bc6ab8b76ab9dfebab1b7eb85 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 19 Sep 2018 20:21:24 +0200 Subject: [PATCH 35/77] cuda fast math for Monero pow v8 Add fast version for div and sqrt for the cuda backend --- .../nvcc_code/cuda_fast_int_math_v2.hpp | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp new file mode 100644 index 000000000..41ec70e1c --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -0,0 +1,106 @@ +#pragma once + +#include + +static __constant__ const uint32_t RCP_C[256] = +{ + 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, + 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, + 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, + 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, + 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, + 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, + 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, + 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, + 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, + 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, + 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, + 
0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, + 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, + 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, + 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, + 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, + 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, + 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, + 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, + 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, + 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, + 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, + 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, + 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, + 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, + 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, + 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, + 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, + 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, + 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, + 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, + 
0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, +}; + +__device__ __forceinline__ uint32_t get_reciprocal(const uint32_t* RCP, uint32_t a) +{ + const uint32_t index1 = (a & 0x7F000000U) >> 23; + const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; + + const uint32_t r1 = RCP[index1]; + uint32_t r2_0 = RCP[index1 + 1]; + if (index2 > 0) r2_0 >>= 16; + const int r2 = r2_0 & 0xFFFFU; + + const uint32_t r = r1 - (uint32_t)(__mul24(r2, index2) >> 6); + + const uint64_t lo0 = (uint64_t)(r) * a; + uint64_t lo = lo0 + ((uint64_t)(a) << 32); + + a >>= 1; + const bool b = (a >= lo) || (lo >= lo0); + lo = a - lo; + + const uint64_t k = __umulhi((uint32_t)lo, r) + ((uint64_t)(r) * ((uint32_t*)&lo)[1]) + lo; + return ((uint32_t*)&k)[1] + (b ? r : 0); +} + +__device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, uint32_t b) +{ + const uint32_t r = get_reciprocal(RCP, b); + const uint64_t k = __umulhi((uint32_t)a, r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; + + uint32_t q[2]; + q[0] = ((uint32_t*)&k)[1]; + q[1] = (k < a) ? 1 : 0; + + const int64_t tmp = a - *((uint64_t*)(q)) * b; + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); + q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? 
b : 0U); + + return *((uint64_t*)(q)); +} + +__device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) +{ + float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); + float x1; + asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); + asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); + + // The following line does x1 *= 4294967296.0f; + x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); + + const uint32_t x0 = __float_as_uint(x) - (158U << 23); + const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); + const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; + + uint32_t result = (x0 << 10) + __float2int_rn(delta); + const uint32_t s = result >> 1; + const uint32_t b = result & 1; + + const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; + if ((int64_t)(x2 + b) > 0) --result; + if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result; + + return result; +} From 659918f26bf07a49059417735f02626545ca1f36 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 19 Sep 2018 20:50:32 +0200 Subject: [PATCH 36/77] NVIDIA: optimize div and sqrt - use optimzed div and sqrt - reduce memory footprint --- xmrstak/backend/amd/jconf.cpp | 2 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 62 ++++++------------- 2 files changed, 19 insertions(+), 45 deletions(-) diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 777dbdbb5..fb1a04b4c 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -151,7 +151,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) cfg.memChunk = (int)memChunk->GetInt64(); - if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || ) + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128) { printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two"); return false; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 1273f89e9..4e34e75a9 
100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -7,6 +7,8 @@ #include #include "xmrstak/jconf.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" + #ifdef _WIN32 #include @@ -203,22 +205,6 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3 return tmp; } -__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0) -{ - uint64_t x = (n0 >> 12) + (1023ULL << 52); - const double xx = sqrt( *reinterpret_cast(&x) ); - uint64_t r = *reinterpret_cast(&xx); - - const uint64_t s = r >> 20; - r >>= 19; - - uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); - - if (x2 < n0) ++r; - - return r; -} - template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) @@ -229,6 +215,12 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti __shared__ uint32_t sharedMemory[1024]; cn_aes_gpu_init( sharedMemory ); + __shared__ uint32_t RCP[256]; + for (int i = threadIdx.x; i < 256; i += blockDim.x) + { + RCP[i] = RCP_C[i]; + } + __syncthreads( ); @@ -284,7 +276,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti // must be valid only for `sub < 2` division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; - sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; } else d[1] = (d_ctx_b + thread * 4)[sub]; @@ -421,39 +413,23 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti if(ALGO == cryptonight_monero_v8 ) { - const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1); - // Use division and square root results from the _previous_ iteration to hide the latency const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); - - const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); - const uint64_t cl_rhs = 
division_result_64 ^ (sqrt_result_64 << 32); - + uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); + ((uint32_t*)&division_result_64)[1] ^= sqrt_result; + if(sub < 2) - *((uint64_t*)yy) ^= cl_rhs; - - - const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL; + *((uint64_t*)yy) ^= division_result_64; - // Most and least significant bits in the divisor are set to 1 - // to make sure we don't divide by a small or even number, - // so there are no shortcuts for such cases - // - // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 - // We drop the highest bit to fit both quotient and remainder in 32 bits - - // Compiler will optimize it to a single div instruction + const uint32_t dd = (static_cast(cx0) + (sqrt_result << 1)) | 0x80000001UL; const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); - - - const uint64_t division_result_tmp = static_cast(cx1 / dd) + ((cx1 % dd) << 32); + const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd); division_result = ((uint32_t*)&division_result_tmp)[sub % 2]; // Use division_result as an input for the square root to prevent parallel implementation in hardware - const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp); - sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2]; + sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp); } uint32_t zz[2]; @@ -706,7 +682,6 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { - if(miner_algo == cryptonight_monero) { cryptonight_core_gpu_hash(ctx, startNonce); @@ -745,11 +720,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t } else if(miner_algo == cryptonight_haven) { - cryptonight_core_gpu_hash(ctx, startNonce); + cryptonight_core_gpu_hash(ctx, startNonce); } else if(miner_algo == cryptonight_bittube2) { - cryptonight_core_gpu_hash(ctx, 
startNonce); + cryptonight_core_gpu_hash(ctx, startNonce); } - } From fd27561be68abaf435bd1296eb9d35f7e790e57c Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 19 Sep 2018 21:35:35 +0200 Subject: [PATCH 37/77] NVIDIA: optimze v8 - fix that shared memory for fast div is always used even if an algorithm is not using it - optimize fast div algo - store `division_result` (64_bit) per thread instead of shuffle around and store it as 32bit --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 30 ++++++++++--------- .../nvcc_code/cuda_fast_int_math_v2.hpp | 8 ++--- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 4e34e75a9..563814702 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti __shared__ uint32_t sharedMemory[1024]; cn_aes_gpu_init( sharedMemory ); - __shared__ uint32_t RCP[256]; - for (int i = threadIdx.x; i < 256; i += blockDim.x) + uint32_t* RCP; + if(ALGO == cryptonight_monero_v8) { - RCP[i] = RCP_C[i]; + __shared__ uint32_t RCP_shared[256]; + for (int i = threadIdx.x; i < 256; i += blockDim.x) + { + RCP_shared[i] = RCP_C[i]; + } + RCP = RCP_shared; } @@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti } } - uint32_t bx1, division_result, sqrt_result; + uint32_t bx1, sqrt_result; + uint64_t division_result; if(ALGO == cryptonight_monero_v8) { d[1] = (d_ctx_b + thread * 12)[sub]; bx1 = (d_ctx_b + thread * 12 + 4)[sub]; // must be valid only for `sub < 2` - division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; + division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; } else @@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, 
int parti { // Use division and square root results from the _previous_ iteration to hide the latency const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); - - uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); - ((uint32_t*)&division_result_64)[1] ^= sqrt_result; + ((uint32_t*)&division_result)[1] ^= sqrt_result; if(sub < 2) - *((uint64_t*)yy) ^= division_result_64; + *((uint64_t*)yy) ^= division_result; const uint32_t dd = (static_cast(cx0) + (sqrt_result << 1)) | 0x80000001UL; const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); - const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd); - - division_result = ((uint32_t*)&division_result_tmp)[sub % 2]; - + division_result = fast_div_v2(RCP, cx1, dd); + // Use division_result as an input for the square root to prevent parallel implementation in hardware - sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp); + sqrt_result = fast_sqrt_v2(cx0 + division_result); } uint32_t zz[2]; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp index 41ec70e1c..2a25a9c07 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, q[1] = (k < a) ? 1 : 0; const int64_t tmp = a - *((uint64_t*)(q)) * b; - const bool overshoot = (tmp < 0); - const bool undershoot = (tmp >= b); + const uint32_t overshoot = (tmp < 0) ? 1u : 0U; + const uint32_t undershoot = (tmp >= b) ? 1u : 0U; - q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); - q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U); + q[0] += undershoot - overshoot; + q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? 
b : 0U); return *((uint64_t*)(q)); } From 2818a4481eb23d0971974879b09707d07724942a Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 21 Sep 2018 20:53:31 +0200 Subject: [PATCH 38/77] NVIDIA: sqrt optimization cryptonight_v8 Avoid branch divergence --- xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp index 2a25a9c07..e3220230a 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -99,8 +99,9 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) const uint32_t b = result & 1; const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; - if ((int64_t)(x2 + b) > 0) --result; - if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result; + const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; + const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 
1 : 0; + result += (overshoot+undershoot); return result; } From fce822e5f094d8bde9d0c3f3745d91129506ded0 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 21 Sep 2018 20:55:56 +0200 Subject: [PATCH 39/77] AMD: remove unused functions - remove unused host function (relict from old refactoring) - remove unused OpenCL full div function --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 21 -------------- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 28 ------------------- 2 files changed, 49 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 767e53855..e2c2dfeb8 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -611,27 +611,6 @@ const char* const attributeNames[] = { #define NELEMS(x) (sizeof(x) / sizeof((x)[0])) -void PrintDeviceInfo(cl_device_id device) -{ - char queryBuffer[1024]; - int queryInt; - cl_int clError; - clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_NAME: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DRIVER_VERSION: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_VERSION: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL); - printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt); -} - uint32_t getNumPlatforms() { cl_uint num_platforms = 0; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index fe7cea1ee..607806b7a 100644 --- 
a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -81,34 +81,6 @@ inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) ); } -inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r) -{ - const uint rcp = get_reciprocal((const __local uchar *)RCP, b); - const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a; - - ((uint*)q)[0] = as_uint2(k).s1; - ((uint*)q)[1] = (k < a) ? 1 : 0; - - long tmp = a - (*q) * b; - - const bool overshoot = (tmp < 0); - const bool undershoot = (tmp >= b); - - if (overshoot) - { - --(*q); - tmp += b; - } - - if (undershoot) - { - ++(*q); - tmp -= b; - } - - *r = tmp; -} - inline uint fast_sqrt_v2(const ulong n1) { float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); From e94296672a56afd2497bebc8fc1de9b2557bc7d5 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Sat, 22 Sep 2018 06:59:28 -0600 Subject: [PATCH 40/77] configEditor: add version tagging, line filtering per-platform --- xmrstak/backend/amd/config.tpl | 4 +- xmrstak/backend/cpu/config.tpl | 4 +- xmrstak/backend/nvidia/config.tpl | 4 +- xmrstak/config.tpl | 73 +++++++++++++++++-------------- xmrstak/misc/configEditor.hpp | 19 ++++++++ xmrstak/pools.tpl | 5 +-- 6 files changed, 68 insertions(+), 41 deletions(-) diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 043b05355..18ef8c696 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary. * index - GPU index number usually starts from 0 @@ -37,5 +38,4 @@ GPUCONFIG * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel. 
*/ "platform_index" : PLATFORMINDEX, - )===" diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index e4da15fad..37158d6e2 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * Thread configuration for each thread. Make sure it matches the number above. * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true, @@ -38,5 +39,4 @@ R"===( [ CPUCONFIG ], - )===" diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 2aa68dc46..144da80b9 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * GPU configuration. You should play around with threads and blocks as the fastest settings will vary. * index - GPU index number usually starts from 0. @@ -35,5 +36,4 @@ R"===( [ GPUCONFIG ], - )===" diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 14330a829..deb52aa09 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * Network timeouts. * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make @@ -58,43 +59,53 @@ R"===( * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration, * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is * meant as a backup, you won't get stellar results there. If you are running into trouble, especially - * on Windows, please read the common issues in the README. - * - * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows. - * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN - * - * 1. 
On the Start menu, click Run. In the Open box, type gpedit.msc. - * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings. - * 3. Expand Security Settings, and then expand Local Policies. - * 4. Select the User Rights Assignment folder. - * 5. The policies will be displayed in the details pane. - * 6. In the pane, double-click Lock pages in memory. - * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group. - * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on - * 9. Reboot for change to take effect. - * - * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need - * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory. - * - * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your - * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144" - * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT - * recommended for security reasons. - * - * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a - * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between - * locked and unlocked memory. If that is your setup see option "no_mlck". + * on Windows, please read the common issues in the README and FAQ. + * + * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.---WINDOWS + * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN---WINDOWS + *---WINDOWS + * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.---WINDOWS + * 2. 
On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.---WINDOWS + * 3. Expand Security Settings, and then expand Local Policies.---WINDOWS + * 4. Select the User Rights Assignment folder.---WINDOWS + * 5. The policies will be displayed in the details pane.---WINDOWS + * 6. In the pane, double-click Lock pages in memory.---WINDOWS + * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.---WINDOWS + * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on---WINDOWS + * 9. Reboot for change to take effect.---WINDOWS + *---WINDOWS + * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need---WINDOWS + * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.---WINDOWS + * On Linux you will need to configure large page support and increase your memlock limit (ulimit -l).---LINUX + *---LINUX + * To set large page support, add the following to "/etc/sysctl.d/60-hugepages.conf":---LINUX + * vm.nr_hugepages=128---LINUX + * You WILL need to run "sudo sysctl --system" for these settings to take effect on your system (or reboot).---LINUX + * In some cases (many threads, very large CPU, etc) you may need more than 128---LINUX + * (try 256 if there are still complaints from thread inits)---LINUX + *---LINUX + * To increase the memlock (ulimit -l), add following lines to /etc/security/limits.d/60-memlock.conf:---LINUX + * * - memlock 262144---LINUX + * root - memlock 262144---LINUX + * You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).---LINUX + *---LINUX + * Check with "/sbin/sysctl vm.nr_hugepages ; ulimit -l" to validate---LINUX + *---LINUX + * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely 
to happen on a---LINUX + * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX + * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX */ /* * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here: * always - Don't even try to use large pages. Always use slow memory. * warn - We will try to use large pages, but fall back to slow memory if that fails. - * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory. - * It will never use slow memory, but it won't attempt to mlock + * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.---LINUX + * It will never use slow memory, but it won't attempt to mlock---LINUX * never - If we fail to allocate large pages we will print an error and exit. */ -"use_slow_memory" : "warn", +"use_slow_memory" : "warn",---WINDOWS +"use_slow_memory" : "no_mlck",---LINUX /* * TLS Settings @@ -149,6 +160,4 @@ R"===( * This setting will only be needed in 2020's. No need to worry about it now. 
*/ "prefer_ipv4" : true, - )===" - diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp index d95ea6b72..3f79df44c 100644 --- a/xmrstak/misc/configEditor.hpp +++ b/xmrstak/misc/configEditor.hpp @@ -6,6 +6,7 @@ #include #include +#include "../version.hpp" namespace xmrstak { @@ -42,6 +43,24 @@ struct configEditor void write(const std::string filename) { + // endmarks: for filtering full lines inside the template string + // Platform marks are done globally here + // "---WINDOWS" endmark keeps lines when compiled for Windows + // "---LINUX" endmark keeps lines when compiled for Linux (and anything not-windows) +#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__WINDOWS__) + // windows: + // completely drop lines with endmark-linux + replace(".*---LINUX\n", ""); + // strip off windows endmarks, keep the lines + replace("---WINDOWS\n", "\n"); +#else + // not-windows: + // completely drop lines with endmark-windows + replace(".*---WINDOWS\n", ""); + // strip off linux endmarks, keep the lines + replace("---LINUX\n", "\n"); +#endif + replace("XMRSTAK_VERSION", get_version_str()); std::ofstream out(filename); out << m_fileContent; out.close(); diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 9c3dd5a59..59c4ba9d6 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * pool_address - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported. * wallet_address - Your wallet, or pool login. 
@@ -50,6 +51,4 @@ POOLCONF], */ "currency" : "CURRENCY", - )===" - From 1fbfb1547ce6794615d20b1525bdd8dec3995048 Mon Sep 17 00:00:00 2001 From: BBSCoin Developer <43017551+bbscoindev@users.noreply.github.com> Date: Sat, 22 Sep 2018 23:41:25 -0700 Subject: [PATCH 41/77] Update BBSCoin config for preparing for the next fork --- xmrstak/jconf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index c69d47ab8..c0ef1a779 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -88,7 +88,7 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion { "aeon7", {cryptonight_aeon, cryptonight_lite, 7u}, {cryptonight_aeon, cryptonight_lite, 7u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_monero, cryptonight, 3u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, + { "bbscoin", {cryptonight_lite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "bittube", {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, { "cryptonight", {cryptonight_monero, cryptonight, 255u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, From 957503b1d38583c5b80ab34fde25d71edff0ee48 Mon Sep 17 00:00:00 2001 From: BBSCoin Developer <43017551+bbscoindev@users.noreply.github.com> Date: Sun, 23 Sep 2018 00:02:48 -0700 Subject: [PATCH 42/77] Change BBSCoin PoW to CN lite v7 --- xmrstak/jconf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index c0ef1a779..355da8e6e 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -88,7 +88,7 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); 
xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion { "aeon7", {cryptonight_aeon, cryptonight_lite, 7u}, {cryptonight_aeon, cryptonight_lite, 7u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_lite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, + { "bbscoin", {cryptonight_aeon, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "bittube", {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, { "cryptonight", {cryptonight_monero, cryptonight, 255u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, From f03319c33429376d333510058579b7ead6241aec Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Sat, 22 Sep 2018 09:08:30 -0600 Subject: [PATCH 43/77] telemetry: Add mutex to avoid push during recalc and other races --- xmrstak/misc/telemetry.cpp | 2 ++ xmrstak/misc/telemetry.hpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp index 5642f6b66..197da8eca 100644 --- a/xmrstak/misc/telemetry.cpp +++ b/xmrstak/misc/telemetry.cpp @@ -49,6 +49,7 @@ telemetry::telemetry(size_t iThd) double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) { + std::unique_lock lk(mtx); uint64_t iTimeNow = get_timestamp_ms(); uint64_t iEarliestHashCnt = 0; @@ -98,6 +99,7 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp) { + std::unique_lock lk(mtx); size_t iTop = iBucketTop[iThd]; ppHashCounts[iThd][iTop] = iHashCount; ppTimestamps[iThd][iTop] = iTimestamp; diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp index 309fd6d06..1813c00e6 100644 --- a/xmrstak/misc/telemetry.hpp +++ 
b/xmrstak/misc/telemetry.hpp @@ -2,6 +2,7 @@ #include #include +#include namespace xmrstak { @@ -14,6 +15,7 @@ class telemetry double calc_telemetry_data(size_t iLastMillisec, size_t iThread); private: + mutable std::mutex mtx; constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations constexpr static size_t iBucketMask = iBucketSize - 1; uint32_t* iBucketTop; From cac26b96d642f52071182b087f2001181d0d7a95 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 23 Sep 2018 21:02:29 +0200 Subject: [PATCH 44/77] iadd cryptonight_v8 tweak 2.2 add cpu implementation for the final monero POW --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 10 ++++-- ...yptonight_v8_main_loop_ivybridge_linux.inc | 12 +++++-- ...yptonight_v8_main_loop_ivybridge_win64.inc | 12 +++++-- .../cryptonight_v8_main_loop_ryzen_linux.inc | 10 ++++-- .../cryptonight_v8_main_loop_ryzen_win64.inc | 10 ++++-- .../backend/cpu/crypto/cryptonight_aesni.h | 31 ++++++++++++++----- xmrstak/backend/cpu/minethd.cpp | 4 +-- 7 files changed, 66 insertions(+), 23 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 286bc39b6..e65f0ed05 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -748,19 +748,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); #endif + ulong2 result_mul; + result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0); + result_mul.s1 = c[0] * as_ulong2(tmp).s0; // cryptonight_monero_v8 #if(ALGO==11) { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + result_mul ^= chunk2; ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); 
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); } #endif - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + a[1] += result_mul.s1; + a[0] += result_mul.s0; // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc index 21f1f48c3..bc4a82f86 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret: mov r9, r10 mov rax, rdi mul rbp + movq xmm0, rax + movq xmm1, rdx + punpcklqdq xmm1, xmm0 xor r9, 16 mov rcx, r10 xor rcx, 32 xor r10, 48 - add r8, rdx - add r11, rax - movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm1 + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 + add r8, rdx + add r11, rax mov QWORD PTR [r14], r8 xor r8, rdi mov r10, r8 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc index ee7f31716..3687d999b 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc @@ -113,17 +113,21 @@ 
sqrt_fixup_ivybridge_ret: mov r9, r10 mov rax, rdi mul rbp + movq xmm0, rax + movq xmm1, rdx + punpcklqdq xmm1, xmm0 xor r9, 16 mov rcx, r10 xor rcx, 32 xor r10, 48 - add r8, rdx - add r11, rax - movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm1 + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 + add r8, rdx + add r11, rax mov QWORD PTR [r14], r8 xor r8, rdi mov r10, r8 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc index 9c177b85a..a375a661f 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -109,14 +109,20 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - movdqa xmm0, XMMWORD PTR [r10+rbx] - movdqa xmm2, XMMWORD PTR [r9+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm4 paddq xmm2, xmm3 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc index f70dccef8..a55004e42 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc @@ -109,14 +109,20 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 
mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - movdqa xmm0, XMMWORD PTR [r10+rbx] - movdqa xmm2, XMMWORD PTR [r9+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm4 paddq xmm2, xmm3 diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 6edae905e..c0f122fd6 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -543,7 +543,7 @@ inline void set_float_rounding_mode() #endif } -#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -556,6 +556,21 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } +#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ if(ALGO == cryptonight_monero_v8) \ { \ @@ -637,7 +652,7 @@ inline void 
set_float_rounding_mode() else \ cx = _mm_aesenc_si128(cx, ax0); \ } \ - CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -659,18 +674,18 @@ inline void set_float_rounding_mode() cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ - CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ - if(ALGO == cryptonight_monero_v8) \ - { \ - bx1 = bx0; \ - bx0 = cx; \ - } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ ah0 += lo; \ al0 += hi; \ } \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 05743ae92..a344a9ffe 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -310,11 +310,11 @@ bool minethd::self_test() { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult &= memcmp(out, 
"\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; } else if(algo == cryptonight_aeon) { From 915c868a487141c9a05439c2facb0fa21b1b8c8b Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 24 Sep 2018 20:11:22 +0200 Subject: [PATCH 45/77] disbale CUDA backend for cryptonight_v8 --- xmrstak/backend/nvidia/minethd.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 486a990e3..dc9b5fccf 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -144,6 +144,13 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor { std::vector* pvThreads = new std::vector(); + auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); + if(miner_algo == cryptonight_monero_v8) + { + std::cerr<<"ERROR: The CUDA backend is currently not supporting cryptonight_v8, please use `--openCLVendor NVIDIA` instead."< Date: Mon, 24 Sep 2018 20:21:09 +0200 Subject: [PATCH 46/77] optimize asm code cryptonight_v8 apply optimizations Co-authored-by: SChernykh --- ...yptonight_v8_main_loop_ivybridge_linux.inc | 72 ++++++++++--------- ...yptonight_v8_main_loop_ivybridge_win64.inc | 71 +++++++++--------- .../cryptonight_v8_main_loop_ryzen_linux.inc | 23 +++--- .../cryptonight_v8_main_loop_ryzen_win64.inc | 25 ++++--- 4 files changed, 99 insertions(+), 92 deletions(-) mode change 100644 => 100755 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc mode change 100644 => 100755 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc 
b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc index bc4a82f86..cbe43b0d3 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -48,10 +48,10 @@ punpcklqdq xmm4, xmm0 movq xmm0, rcx punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] ALIGN 8 main_loop_ivybridge: - movdqu xmm6, XMMWORD PTR [r10+rbx] lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d mov eax, r10d @@ -63,28 +63,30 @@ main_loop_ivybridge: movq xmm7, r8 punpcklqdq xmm7, xmm0 aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm1, xmm7 - movdqu xmm2, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm5 paddq xmm2, xmm4 movdqu XMMWORD PTR [rcx+rbx], xmm0 - movq rcx, xmm3 movdqu XMMWORD PTR [rax+rbx], xmm2 - mov rax, rcx movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx shl rax, 32 xor rdi, rax - movq rbp, xmm6 movdqa xmm0, xmm6 pxor xmm0, xmm4 - mov r10, rbp - and r10d, 2097136 movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r10+rbx] - lea r14, QWORD PTR [r10+rbx] - mov r12, QWORD PTR [r10+rbx+8] + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] xor edx, edx lea r9d, DWORD PTR [ecx+ecx] add r9d, ebp @@ -93,6 +95,7 @@ main_loop_ivybridge: or r9d, r13d movq rax, xmm0 div r9 + xorps xmm3, xmm3 mov eax, eax shl rdx, 32 add rdx, rax @@ -103,31 +106,37 @@ main_loop_ivybridge: movq xmm0, rax paddq xmm0, xmm8 sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] movq rdx, xmm3 - test rdx, 524287 + test edx, 524287 je sqrt_fixup_ivybridge psrlq xmm3, 19 psubq xmm3, XMMWORD PTR [rsp+16] sqrt_fixup_ivybridge_ret: - mov r9, r10 + mov ecx, r10d mov rax, rdi mul rbp - movq xmm0, rax - movq xmm1, rdx - punpcklqdq xmm1, xmm0 + movq xmm2, rdx + xor rdx, [rcx+rbx] + add 
r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - movdqu xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm1 - movdqu xmm0, XMMWORD PTR [r10+rbx] + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -135,13 +144,8 @@ sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 - add r8, rdx - add r11, rax - mov QWORD PTR [r14], r8 - xor r8, rdi - mov r10, r8 - mov QWORD PTR [r14+8], r11 - and r10d, 2097136 + movdqu xmm6, [rdi+rbx] + mov r10d, edi xor r11, r12 dec rsi jne main_loop_ivybridge @@ -163,15 +167,15 @@ sqrt_fixup_ivybridge_ret: sqrt_fixup_ivybridge: dec rdx - mov r13d, -1022 - shl r13, 32 + mov r13d, -1022 + shl r13, 32 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax add rax, r13 - not r13 + not r13 sub rcx, r13 mov r13d, -2147483647 imul rcx, rax diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc old mode 100644 new mode 100755 index 3687d999b..8d49c5db7 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc @@ -48,10 +48,10 @@ punpcklqdq xmm4, xmm0 movd xmm0, rcx punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] ALIGN 8 main_loop_ivybridge: - movdqu xmm6, XMMWORD PTR [r10+rbx] lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d mov eax, r10d @@ -63,28 +63,30 @@ main_loop_ivybridge: movd xmm7, r8 punpcklqdq xmm7, xmm0 aesenc xmm6, xmm7 + movd rbp, xmm6 + mov r9, rbp + 
and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm1, xmm7 - movdqu xmm2, XMMWORD PTR [rcx+rbx] paddq xmm0, xmm5 paddq xmm2, xmm4 movdqu XMMWORD PTR [rcx+rbx], xmm0 - movd rcx, xmm3 movdqu XMMWORD PTR [rax+rbx], xmm2 - mov rax, rcx movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movd rcx, xmm3 + mov rax, rcx shl rax, 32 xor rdi, rax - movd rbp, xmm6 movdqa xmm0, xmm6 pxor xmm0, xmm4 - mov r10, rbp - and r10d, 2097136 movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r10+rbx] - lea r14, QWORD PTR [r10+rbx] - mov r12, QWORD PTR [r10+rbx+8] + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] xor edx, edx lea r9d, DWORD PTR [ecx+ecx] add r9d, ebp @@ -93,6 +95,7 @@ main_loop_ivybridge: or r9d, r13d movd rax, xmm0 div r9 + xorps xmm3, xmm3 mov eax, eax shl rdx, 32 add rdx, rax @@ -103,31 +106,37 @@ main_loop_ivybridge: movd xmm0, rax paddq xmm0, xmm8 sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] movd rdx, xmm3 - test rdx, 524287 + test edx, 524287 je sqrt_fixup_ivybridge psrlq xmm3, 19 psubq xmm3, XMMWORD PTR [rsp+16] sqrt_fixup_ivybridge_ret: - mov r9, r10 + mov ecx, r10d mov rax, rdi mul rbp - movq xmm0, rax - movq xmm1, rdx - punpcklqdq xmm1, xmm0 + movd xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movd xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - movdqu xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm1 - movdqu xmm0, XMMWORD PTR [r10+rbx] + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] paddq xmm0, xmm5 movdqu xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] paddq xmm2, xmm4 paddq xmm1, xmm7 movdqa xmm5, xmm4 @@ -135,13 +144,8 @@ 
sqrt_fixup_ivybridge_ret: movdqa xmm4, xmm6 movdqu XMMWORD PTR [rcx+rbx], xmm2 movdqu XMMWORD PTR [r10+rbx], xmm1 - add r8, rdx - add r11, rax - mov QWORD PTR [r14], r8 - xor r8, rdi - mov r10, r8 - mov QWORD PTR [r14+8], r11 - and r10d, 2097136 + movdqu xmm6, [rdi+rbx] + mov r10d, edi xor r11, r12 dec rsi jne main_loop_ivybridge @@ -163,14 +167,15 @@ sqrt_fixup_ivybridge_ret: sqrt_fixup_ivybridge: dec rdx - mov r13, -4389456576512 + mov r13d, -1022 + shl r13, 32 mov rax, rdx shr rdx, 19 shr rax, 20 mov rcx, rdx sub rcx, rax add rax, r13 - mov r13, 4389456576511 + not r13 sub rcx, r13 mov r13d, -2147483647 imul rcx, rax diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc index a375a661f..cd8b43477 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -109,25 +109,24 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm0, xmm4 + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] paddq xmm2, xmm3 paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [r9+rbx], xmm4 movdqa XMMWORD PTR [rcx+rbx], xmm2 movdqa XMMWORD PTR [r10+rbx], xmm1 @@ -163,8 +162,8 @@ sqrt_fixup_ryzen_ret: sqrt_fixup_ryzen: movq r9, xmm2 dec rdi - mov edx, -1022 - shl rdx, 32 + mov edx, -1022 + shl rdx, 32 mov rax, rdi shr rdi, 19 shr rax, 20 diff --git 
a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc old mode 100644 new mode 100755 index a55004e42..d103cc2ee --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc @@ -109,25 +109,24 @@ main_loop_ryzen: sqrt_fixup_ryzen_ret: mov rax, rsi mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 mov r9d, r10d mov ecx, r10d xor r9d, 16 xor ecx, 32 xor r10d, 48 - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - movdqa xmm0, XMMWORD PTR [r10+rbx] movdqa xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm0, xmm4 + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] paddq xmm2, xmm3 paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [r9+rbx], xmm4 movdqa XMMWORD PTR [rcx+rbx], xmm2 movdqa XMMWORD PTR [r10+rbx], xmm1 @@ -163,14 +162,14 @@ sqrt_fixup_ryzen_ret: sqrt_fixup_ryzen: movd r9, xmm2 dec rdi - mov rdx, 4389456576511 + mov edx, -1022 + shl rdx, 32 mov rax, rdi shr rdi, 19 shr rax, 20 mov rcx, rdi sub rcx, rax - sub rcx, rdx - mov rdx, -4389456576512 + lea rcx, [rcx+rdx+1] add rax, rdx imul rcx, rax sub rcx, r9 From 5db405c27842b35fcdd3488db344d10095c51013 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 29 Sep 2018 23:31:20 +0200 Subject: [PATCH 47/77] cuda: implement cryptonight_v8 - introduce a new schema where two threads work together on one hash - update autoadjustment - remove an mistake where shared memory was shrinked for gpus < sm_70 --- xmrstak/backend/nvidia/minethd.cpp | 5 - xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 465 ++++++++++++------ .../backend/nvidia/nvcc_code/cuda_extra.cu | 27 +- 3 files changed, 322 insertions(+), 175 deletions(-) diff 
--git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index dc9b5fccf..423cd201a 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -145,11 +145,6 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor std::vector* pvThreads = new std::vector(); auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - if(miner_algo == cryptonight_monero_v8) - { - std::cerr<<"ERROR: The CUDA backend is currently not supporting cryptonight_v8, please use `--openCLVendor NVIDIA` instead."<x)[0] = x0; + } + + __forceinline__ __device__ u64 operator^=(const u64& other) + { + uint2::x ^= other.x; + uint2::y ^= other.y; + + return *this; + } + + __forceinline__ __device__ u64 operator+(const u64& other) const + { + u64 tmp; + ((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0]; + + return tmp; + } + + __forceinline__ __device__ u64 operator+=(const uint64_t& other) + { + return ((uint64_t*)&this->x)[0] += other; + } + + __forceinline__ __device__ void print(int i) const + { + if(i<2) + printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]); + } +}; + + template #ifdef XMR_STAK_THREADS -__launch_bounds__( XMR_STAK_THREADS * 4 ) +__launch_bounds__( XMR_STAK_THREADS * 2 ) #endif -__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[1024]; cn_aes_gpu_init( sharedMemory ); + uint32_t* RCP; if(ALGO == cryptonight_monero_v8) { @@ -226,6 +277,195 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti RCP = RCP_shared; } +#if( 
__CUDA_ARCH__ < 300 ) + extern __shared__ u64 externShared[]; + // 8 x 64bit values + u64* myChunks = (u64*)(externShared + (threadIdx.x >> 1) * 8); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); +#else + extern __shared__ u64 chunkMem[]; + volatile uint32_t* sPtr = NULL; + // 8 x 64bit values + u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8); + +#endif + + __syncthreads( ); + + const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = tid >> 1; + const uint32_t sub = tid & 1; + + if ( thread >= threads ) + return; + + uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; + + u64 ax0 = ((u64*)(d_ctx_a + thread * 4))[sub]; + u64 bx0; + uint32_t idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); + + u64* ptr0; + + u64 bx1; + uint32_t sqrt_result; + uint64_t division_result; + if(ALGO == cryptonight_monero_v8) + { + bx0 = ((u64*)(d_ctx_b + thread * 12))[sub]; + bx1 = ((u64*)(d_ctx_b + thread * 12 + 4))[sub]; + + division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; + } + else + bx0 = ((u64*)(d_ctx_b + thread * 4))[sub]; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + + for(int i = start; i < end; ++i) + { + ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; + + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + + uint32_t idx1 = (idx0 & 0x30) >> 3; + + const u64 cx = myChunks[ idx1 + sub ]; + const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + + u64 cx_aes = ax0 ^ u64( + t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ), + t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) ) + ); + + if(ALGO == cryptonight_monero_v8) + { + + const u64 chunk1 = 
myChunks[ idx1 ^ 2 + sub ]; + const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + + myChunks[ idx1 + sub ] = cx_aes ^ bx0; + for(int x = 0; x < 8; x += 2) + ptr0[ x + sub ] = myChunks[x + sub]; + + idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); + idx1 = (idx0 & 0x30) >> 3; + ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + + if(ALGO != cryptonight_monero_v8) + bx0 = cx_aes; + + uint64_t cx_mul; + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + + if(ALGO == cryptonight_monero_v8 && sub == 1) + { + // Use division and square root results from the _previous_ iteration to hide the latency + ((uint32_t*)&division_result)[1] ^= sqrt_result; + + ((uint64_t*)myChunks)[ idx1 ] ^= division_result; + + const uint32_t dd = (static_cast(cx_mul) + (sqrt_result << 1)) | 0x80000001UL; + division_result = fast_div_v2(RCP, cx_aes, dd); + + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(cx_mul + division_result); + } +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ]; + + { + uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; + // sub 0 -> hi, sub 1 -> lo + uint64_t res = sub == 0 ? 
__umul64hi( cx_mul, cl ) : cx_mul * cl; + if(ALGO == cryptonight_monero_v8) + { + const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; + u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; + res ^= ((uint64_t*)&chunk2)[0]; + const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + ax0 += res; + } + if(ALGO == cryptonight_monero_v8) + { + bx1 = bx0; + bx0 = cx_aes; + } + myChunks[ idx1 + sub ] = ax0; + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + ax0 ^= c; + idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); + } + + if ( bfactor > 0 ) + { + ((u64*)(d_ctx_a + thread * 4))[sub] = ax0; + if(ALGO == cryptonight_monero_v8) + { + ((u64*)(d_ctx_b + thread * 12))[sub] = bx0; + ((u64*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; + + if(sub == 1) + { + // must be valid only for `sub == 1` + ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result; + (d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result; + } + } + else + ((u64*)(d_ctx_b + thread * 12))[sub] = bx0; + } +} + +template +#ifdef XMR_STAK_THREADS +__launch_bounds__( XMR_STAK_THREADS * 4 ) +#endif +__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, + uint32_t startNonce, uint32_t * __restrict__ d_input ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); __syncthreads( ); @@ -272,20 +512,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = *(d_ctx_b + threads * 4 + thread); } } - - uint32_t bx1, sqrt_result; - uint64_t division_result; - if(ALGO == cryptonight_monero_v8) - { - d[1] = (d_ctx_b + thread * 12)[sub]; - bx1 = (d_ctx_b + thread * 12 + 4)[sub]; - - // must be valid only for `sub < 2` - 
division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; - sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; - } - else - d[1] = (d_ctx_b + thread * 4)[sub]; + d[1] = (d_ctx_b + thread * 4)[sub]; #pragma unroll 2 for ( i = start; i < end; ++i ) @@ -294,7 +521,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti for ( int x = 0; x < 2; ++x ) { j = ( ( idx0 & MASK ) >> 2 ) + sub; - + if(ALGO == cryptonight_bittube2) { uint32_t k[4]; @@ -325,57 +552,6 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti } } } - else if(ALGO == cryptonight_monero_v8) - { - - const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); - uint4 chunk0{}; - chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0); - chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0); - chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0); - chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0); - - const uint32_t x_0 = ((uint32_t*)&chunk0)[sub]; - const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4]; - const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4]; - const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4]; - d[x] = a ^ - t_fn0( x_0 & 0xff ) ^ - t_fn1( (x_1 >> 8) & 0xff ) ^ - t_fn2( (x_2 >> 16) & 0xff ) ^ - t_fn3( ( x_3 >> 24 ) ); - - uint4 value; - const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); - if(sub == 1) - ((uint64_t*)&value)[0] = tmp10; - const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); - if(sub == 1) - ((uint64_t*)&value)[1] = tmp20; - const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); - if(sub == 2) - ((uint64_t*)&value)[0] = tmp11; - const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); - if(sub == 2) - ((uint64_t*)&value)[1] = tmp21; - const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); - if(sub == 3) - ((uint64_t*)&value)[0] = tmp12; - const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); - 
if(sub == 3) - ((uint64_t*)&value)[1] = tmp22; - - if(sub > 0) - { - uint4 store{}; - ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; - ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; - - const int dest = sub + 1; - const int dest2 = dest == 4 ? 1 : dest; - *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; - } - } else { const uint32_t x_0 = loadGlobal32( long_state + j ); @@ -388,6 +564,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti t_fn2( (x_2 >> 16) & 0xff ) ^ t_fn3( ( x_3 >> 24 ) ); } + //XOR_BLOCKS_DST(c, b, &long_state[j]); t1[0] = shuffle<4>(sPtr,sub, d[x], 0); @@ -416,62 +593,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti uint32_t yy[2]; *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); - - if(ALGO == cryptonight_monero_v8 ) - { - // Use division and square root results from the _previous_ iteration to hide the latency - const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); - ((uint32_t*)&division_result)[1] ^= sqrt_result; - - if(sub < 2) - *((uint64_t*)yy) ^= division_result; - - const uint32_t dd = (static_cast(cx0) + (sqrt_result << 1)) | 0x80000001UL; - const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); - division_result = fast_div_v2(RCP, cx1, dd); - - // Use division_result as an input for the square root to prevent parallel implementation in hardware - sqrt_result = fast_sqrt_v2(cx0 + division_result); - } - uint32_t zz[2]; zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); - // Shuffle the other 3x16 byte chunks in the current 64-byte cache line - if(ALGO == cryptonight_monero_v8) - { - uint4 value; - const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); - if(sub == 1) - ((uint64_t*)&value)[0] = tmp10; - const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); - if(sub == 1) - 
((uint64_t*)&value)[1] = tmp20; - const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); - if(sub == 2) - ((uint64_t*)&value)[0] = tmp11; - const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); - if(sub == 2) - ((uint64_t*)&value)[1] = tmp21; - const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); - if(sub == 3) - ((uint64_t*)&value)[0] = tmp12; - const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); - if(sub == 3) - ((uint64_t*)&value)[1] = tmp22; - if(sub > 0) - { - const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); - uint4 store{}; - ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; - ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; - - const int dest = sub + 1; - const int dest2 = dest == 4 ? 1 : dest; - *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; - } - } - + t1[1] = shuffle<4>(sPtr,sub, d[x], 1); #pragma unroll for ( k = 0; k < 2; k++ ) @@ -521,31 +646,13 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = (~d) ^ q; } - if(ALGO == cryptonight_monero_v8) - { - bx1 = d[(x + 1) % 2]; - } } } if ( bfactor > 0 ) { (d_ctx_a + thread * 4)[sub] = a; - if(ALGO == cryptonight_monero_v8) - { - (d_ctx_b + thread * 12)[sub] = d[1]; - (d_ctx_b + thread * 12 + 4)[sub] = bx1; - - if(sub < 2) - { - // must be valid only for `sub < 2` - (d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result; - (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result; - } - } - else - (d_ctx_b + thread * 4)[sub] = d[1]; - + (d_ctx_b + thread * 4)[sub] = d[1]; if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; @@ -608,6 +715,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) { dim3 grid( ctx->device_blocks ); dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 2 ); dim3 block4( 
ctx->device_threads << 2 ); dim3 block8( ctx->device_threads << 3 ); @@ -638,25 +746,53 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < partcount; i++ ) { - CUDA_CHECK_MSG_KERNEL( - ctx->device_id, - "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2<<< - grid, - block4, - block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ctx->device_blocks*ctx->device_threads, - ctx->device_bfactor, - i, - ctx->d_long_state, - ctx->d_ctx_a, - ctx->d_ctx_b, - ctx->d_ctx_state, - nonce, - ctx->d_input - ) - ); + if(ALGO == cryptonight_monero_v8) + { + // two threads per block + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", + cryptonight_core_gpu_phase2_double<<< + grid, + block2, + sizeof(uint64_t) * block2.x * 8 + + // shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b, + ctx->d_ctx_state, + nonce, + ctx->d_input + ) + ); + } + else + { + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", + cryptonight_core_gpu_phase2_quad<<< + grid, + block4, + block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b, + ctx->d_ctx_state, + nonce, + ctx->d_input + ) + ); + } if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); } @@ -700,7 +836,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { 
cryptonight_core_gpu_hash(ctx, startNonce); } - else if(miner_algo == cryptonight_lite) + /*else if(miner_algo == cryptonight_lite) { cryptonight_core_gpu_hash(ctx, startNonce); } @@ -722,10 +858,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t } else if(miner_algo == cryptonight_haven) { - cryptonight_core_gpu_hash(ctx, startNonce); + cryptonight_core_gpu_hash(ctx, startNonce); } else if(miner_algo == cryptonight_bittube2) { - cryptonight_core_gpu_hash(ctx, startNonce); + cryptonight_core_gpu_hash(ctx, startNonce); } + */ } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 1ea54ddba..a4d88f21f 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -283,13 +283,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) break; }; - const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1]; - /* Disable L1 cache for GPUs before Volta. - * L1 speed is increased and latency reduced with Volta. 
- */ - if(gpuArch < 70) - CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + // prefer shared memory over L1 cache + CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); size_t hashMemSize = std::max( cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), @@ -691,6 +687,25 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->device_threads = 64; } + // check if cryptonight_monero_v8 is selected for the user pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // overwrite default config if cryptonight_monero_v8 is mined + if(useCryptonight_v8) + { + // 4 based on my test maybe it must be adjusted later + size_t threads = 4; + // 8 is chosen by checking the occupancy calculator + size_t blockOptimal = 8 * ctx->device_mpcount; + + if(blockOptimal * threads * hashMemSize < limitedMemory) + { + ctx->device_threads = threads; + ctx->device_blocks = blockOptimal; + } + } } printf("device init succeeded\n"); From 010cbd98bd618a70898aca14426c80d9ef963150 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 30 Sep 2018 22:11:29 +0200 Subject: [PATCH 48/77] cpu: fix missing `asm` autoadjust In the auto adjust without hwlock the asm entry was missing --- xmrstak/backend/cpu/autoAdjust.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 28ff515d4..b192ddc35 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -58,7 +58,7 @@ class autoAdjust if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048)) printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); - conf += std::string(" { 
\"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down."); } else From 22e63ceb33d0ed71c26db94a4e22a608f57d28f1 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 1 Oct 2018 20:03:31 +0200 Subject: [PATCH 49/77] remove using of type `uint` `uint` is unknown in windows, therefore switch to the better type `uint32_t` --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index a6501a9fb..a7bdaca5e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -210,7 +210,7 @@ struct u64 : public uint2 __forceinline__ __device__ u64(){} - __forceinline__ __device__ u64( const uint x0, const uint x1) + __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) { uint2::x = x0; uint2::y = x1; From f27ea67e72f8ab75292ba96293fdce277d0aa3cd Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 1 Oct 2018 20:07:38 +0200 Subject: [PATCH 50/77] add CUDA 10.0 support - extent MSVC workaround for CUDA to 10.0 - add compute architecture 75 if CUDA 10.0 is found --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b385d..3b371b560 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,11 @@ if(CUDA_ENABLE) list(APPEND DEFAULT_CUDA_ARCH "70") endif() endif() + # add Turing support for CUDA >= 10.0 + if(NOT CUDA_VERSION VERSION_LESS 10.0) + list(APPEND DEFAULT_CUDA_ARCH "75") + endif() + set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon 
separated list, e.g. '-DCUDA_ARCH=20;35;60')") # generate comma separated list with architectures @@ -186,7 +191,10 @@ if(CUDA_ENABLE) endif() if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND - (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1 OR CUDA_VERSION VERSION_EQUAL 9.2) + (CUDA_VERSION VERSION_EQUAL 9.0 OR + CUDA_VERSION VERSION_EQUAL 9.1 OR + CUDA_VERSION VERSION_EQUAL 9.2 OR + CUDA_VERSION VERSION_EQUAL 10.0) ) # workaround find_package(CUDA) is using the wrong path to the CXX host compiler # overwrite the CUDA host compiler variable with the used CXX MSVC From 25634d4aab915c48c6deaf574990b72c5954454e Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 1 Oct 2018 22:02:16 +0200 Subject: [PATCH 51/77] cpu: asm double hash - restructure asm preparation function - add double hash asm code --- ..._v8_double_main_loop_sandybridge_linux.inc | 410 ++++++++++++++++++ ..._v8_double_main_loop_sandybridge_win64.inc | 410 ++++++++++++++++++ .../cpu/crypto/asm/cryptonight_v8_main_loop.S | 10 + .../crypto/asm/cryptonight_v8_main_loop.asm | 7 + .../backend/cpu/crypto/cryptonight_aesni.h | 68 ++- xmrstak/backend/cpu/minethd.cpp | 21 +- 6 files changed, 904 insertions(+), 22 deletions(-) create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc new file mode 100644 index 000000000..79adab671 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov 
r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + 
movdqu xmm15, XMMWORD PTR [r9] + +ALIGN 16 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR 
[rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, 
XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, 
-1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc new file mode 100644 index 000000000..ad8f18233 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 
52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 64 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq 
xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, 
rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movd r9, xmm10 + psrldq xmm1, 8 + movd r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, 
XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S index b6be9438f..c0a3d0b41 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S @@ -9,6 +9,7 @@ #endif .global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm) .global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm) +.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm) ALIGN 8 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): @@ -25,3 +26,12 @@ FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): #include "cryptonight_v8_main_loop_ryzen_linux.inc" add rsp, 48 ret 0 + +ALIGN 16 +FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include 
"cryptonight_v8_double_main_loop_sandybridge_linux.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm index a1615e9bd..1f3d2e15c 100644 --- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -1,6 +1,7 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cryptonight_v8_mainloop_ivybridge_asm PUBLIC cryptonight_v8_mainloop_ryzen_asm +PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm ALIGN 8 cryptonight_v8_mainloop_ivybridge_asm PROC @@ -14,5 +15,11 @@ cryptonight_v8_mainloop_ryzen_asm PROC ret 0 cryptonight_v8_mainloop_ryzen_asm ENDP +ALIGN 8 +cryptonight_v8_double_mainloop_sandybridge_asm PROC + INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc + ret 0 +cryptonight_v8_double_mainloop_sandybridge_asm ENDP + _TEXT_CNV8_MAINLOOP ENDS END diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index c0f122fd6..e8c0aca2b 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -940,21 +940,63 @@ struct Cryptonight_hash<5> extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); -template -void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + +template< size_t N, size_t asm_version> +struct Cryptonight_hash_asm; + +template +struct Cryptonight_hash_asm<1, asm_version> { - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 1; - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + template + 
static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MEM = cn_select_memory(); - if (asm_version == 1) - cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); - else - cryptonight_v8_mainloop_ryzen_asm(ctx[0]); + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); - keccakf((uint64_t*)ctx[0]->hash_state, 24); - extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); -} + if(asm_version == 0) + cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); + else if(asm_version == 1) + cryptonight_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); + } +}; + +// double hash only for intel +template< > +struct Cryptonight_hash_asm<2, 0> +{ + static constexpr size_t N = 2; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MEM = cn_select_memory(); + + for(size_t i = 0; i < N; ++i) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + /* Optim - 99% time boundary */ + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } + + cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + + for(size_t i = 0; i < N; ++i) + { + /* Optim - 90% time boundary */ + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + /* Optim - 99% time boundary */ + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } + } +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 
a344a9ffe..bb80b938f 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -455,24 +455,27 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc static_assert(N >= 1, "number of threads must be >= 1" ); // check for asm optimized version for cryptonight_v8 - if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) { if(asm_version_str != "off") { + if(asm_version_str != "intel" && asm_version_str != "ryzen") + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + if(asm_version_str == "intel") { // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) - return cryptonight_hash_v2_asm; + if(N == 1) + return Cryptonight_hash_asm<1u, 0u>::template hash; + else if(N == 2) + return Cryptonight_hash_asm<2u, 0u>::template hash; } - if(asm_version_str == "ryzen") + // supports only 1 thread per hash + if(N == 1 && asm_version_str == "ryzen") { // AMD Ryzen (1xxx and 2xxx series) - return cryptonight_hash_v2_asm; - } - else - { - printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); - } + return Cryptonight_hash_asm<1u, 1u>::template hash; + } } } // We have two independent flag bits in the functions From 1e5bb803a472b21672a69f2b5287c916fbd80f1d Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 1 Oct 2018 23:03:22 +0200 Subject: [PATCH 52/77] re-enable algorithm for cuda I disabled a few algorithms for fatser compile and missed to re-enable them. 
--- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index a7bdaca5e..0f6e47cca 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -836,7 +836,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } - /*else if(miner_algo == cryptonight_lite) + else if(miner_algo == cryptonight_lite) { cryptonight_core_gpu_hash(ctx, startNonce); } @@ -864,5 +864,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } - */ + } From 70f3e82526cdd88607c55f7fab14f57ff0a5aba8 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 3 Oct 2018 20:45:29 +0200 Subject: [PATCH 53/77] rework all currencies - introduce monero oct 2018 fork as currency `monero` - remove monero7 - change all dev pools - those miner monero7 to handle the fork to monero - if the dev pool can not handle the fork to monero the currency is fixed set to `monero` (we can only handle 2 different currencies for user and dev pool) - remove guards those prevent to use the currency `monero` --- doc/FAQ.md | 9 ++--- xmrstak/backend/amd/config.tpl | 2 +- xmrstak/jconf.cpp | 61 +++++++++++++--------------------- xmrstak/pools.tpl | 3 +- 4 files changed, 27 insertions(+), 48 deletions(-) diff --git a/doc/FAQ.md b/doc/FAQ.md index 2d2820166..f744e3d24 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -9,7 +9,6 @@ * [Virus Protection Alert](#virus-protection-alert) * [Change Currency to Mine](#change-currency-to-mine) * [How can I mine Monero](#how-can-i-mine-monero) -* [Why is Monero named monero7](#why-is-monero-named-monero7) * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed) * [Internal 
compiler error: Killed (program cc1plus)](#internal-compiler-error) @@ -88,16 +87,12 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua If the miner is compiled for Monero and Aeon than you can change - the value `currency` in the config *or* - - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon7` + - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7` - run `xmr-stak --help` to see all supported currencies and algorithms ## How can I mine Monero -Set the value `currency` in `pools.txt` to `monero7`. - -## Why is Monero named monero7 - -To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`. +Set the value `currency` in `pools.txt` to `monero`. ## Which currency must be chosen if my fork coin is not listed diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 18ef8c696..0b5dcf863 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -10,7 +10,7 @@ R"===(// generated by XMRSTAK_VERSION * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks - * (not allowed for cryptonight_v8 and monero8) + * (not allowed for cryptonight_v8 and monero) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 355da8e6e..b608c0028 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -87,30 +87,29 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); 
xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {cryptonight_aeon, cryptonight_lite, 7u}, {cryptonight_aeon, cryptonight_lite, 7u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_aeon, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "bittube", {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, - { "cryptonight", {cryptonight_monero, cryptonight, 255u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, - { "cryptonight_masari", {cryptonight_monero, cryptonight_masari, 255u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_heavy", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, - { "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, - { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr }, - { "cryptonight_v7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr }, - { "graft", {cryptonight_monero, cryptonight, 8u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "haven", 
{cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, - { "monero8", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, - { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "turtlecoin", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr } + { "aeon7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, "mine.aeon-pool.com:5555" }, + { "bbscoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "bittube", {cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, + { "cryptonight", {cryptonight_monero_v8, cryptonight, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, + { "cryptonight_masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, + { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "cryptonight_heavy", {cryptonight_heavy, 
cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "graft", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, + { "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, + { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } }; constexpr size_t 
coin_algo_size = (sizeof(coins)/sizeof(coins[0])); @@ -326,13 +325,6 @@ bool jconf::IsOnAlgoList(std::string& needle) { std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower); - if(needle == "monero") - { - printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n" - "Please use 'monero7' (support auto switch to new POW) if you want to mine Monero, \nor name the coin that you want to mine."); - return false; - } - for(size_t i=0; i < coin_algo_size; i++) { if(needle == coins[i].coin_name) @@ -617,13 +609,6 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) for(size_t i=0; i < coin_algo_size; i++) { - if(ctmp == "monero") - { - printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n" - "Please use monero7 (support auto switch to new POW) if you want to mine Monero, or name the coin that you want to mine."); - return false; - } - if(ctmp == coins[i].coin_name) { currentCoin = coins[i]; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 59c4ba9d6..3e21f416d 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -28,8 +28,7 @@ POOLCONF], * haven (automatic switch with block version 3 to cryptonight_haven) * intense * masari - * monero7 - * monero8 (use this to support Monero's Oct 2018 fork) + * monero (use this to support Monero's Oct 2018 fork) * qrl - Quantum Resistant Ledger * ryo * turtlecoin From b926a476fafbf445d52970116379fdc9a53c16a6 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Thu, 4 Oct 2018 11:12:09 -0600 Subject: [PATCH 54/77] spelling+typo touch-ups --- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 5 +++-- xmrstak/backend/nvidia/minethd.cpp | 2 +- xmrstak/http/httpd.cpp | 6 +++--- xmrstak/http/webdesign.cpp | 2 +- xmrstak/http/webdesign.hpp | 2 +- xmrstak/pools.tpl | 4 ++-- 6 
files changed, 11 insertions(+), 10 deletions(-) diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 2bebf82d0..7180491f7 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } @@ -78,7 +78,8 @@ class autoAdjust catch(const std::runtime_error& err) { // \todo add fallback to default auto adjust - conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false"); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED: %s. 
Create config for a single thread.", err.what()); } diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 423cd201a..0153eed19 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -277,7 +277,7 @@ void minethd::work_main() if((round_ctr++ & 0xF) == 0) { globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16); - // check if the job is still valid, there is a small posibility that the job is switched + // check if the job is still valid, there is a small possibility that the job is switched if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) break; } diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp index dba7d7cdb..ed9abc2bc 100644 --- a/xmrstak/http/httpd.cpp +++ b/xmrstak/http/httpd.cpp @@ -71,17 +71,17 @@ int httpd::req_handler(void * cls, if (username == NULL) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); - ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, MHD_NO); + ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO); MHD_destroy_response(rsp); return ret; } free(username); - ret = MHD_digest_auth_check(connection, sHttpAuthRelam, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300); + ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300); if (ret == MHD_INVALID_NONCE || ret == MHD_NO) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); - ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO); + ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? 
MHD_YES : MHD_NO); MHD_destroy_response(rsp); return ret; } diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index d6ee66e8d..93e217519 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -113,7 +113,7 @@ extern const char sHtmlCssFile [] = size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1; -extern const char sHttpAuthRelam[] = "XMR-Stak-Miner"; +extern const char sHttpAuthRealm[] = "XMR-Stak-Miner"; extern const char sHttpAuthOpaque[] = "6c071f0df539e234cadbcd79164af7a594e23ab42bccb834df796aead6ce96e4"; extern const char sHtmlAccessDenied[] = diff --git a/xmrstak/http/webdesign.hpp b/xmrstak/http/webdesign.hpp index 48adfea98..bcbe5ae1d 100644 --- a/xmrstak/http/webdesign.hpp +++ b/xmrstak/http/webdesign.hpp @@ -7,7 +7,7 @@ extern size_t sHtmlCssSize; extern const char sHtmlAccessDenied[]; extern size_t sHtmlAccessDeniedSize; -extern const char sHttpAuthRelam[]; +extern const char sHttpAuthRealm[]; extern const char sHttpAuthOpaque[]; extern const char sHtmlCommonHeader[]; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 59c4ba9d6..37c532f3b 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -23,7 +23,7 @@ POOLCONF], * * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) - * bittube (uses cyrptonight_bittube2 algorithm) + * bittube (uses cryptonight_bittube2 algorithm) * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense @@ -45,7 +45,7 @@ POOLCONF], * cryptonight_v7 * cryptonight_v8 * # 4MiB scratchpad memory - * cyrptonight_bittube2 + * cryptonight_bittube2 * cryptonight_haven * cryptonight_heavy */ From 17e0b06eb83da2403eab61c2f7f270b79f7a0b48 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Thu, 4 Oct 2018 11:26:33 -0600 Subject: [PATCH 55/77] whitespace trims --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 6 +++--- xmrstak/backend/amd/config.tpl | 2 +- xmrstak/backend/amd/jconf.cpp | 2 +- 
xmrstak/backend/amd/minethd.cpp | 2 +- xmrstak/backend/amd/minethd.hpp | 2 +- xmrstak/backend/cpu/crypto/cryptonight_aesni.h | 2 +- xmrstak/backend/cpu/minethd.cpp | 2 +- xmrstak/backend/globalStates.cpp | 6 +++--- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 14 +++++++------- xmrstak/cpputil/read_write_lock.h | 4 ++-- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index e65f0ed05..fd630aff3 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -569,7 +569,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, #else # define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) #endif - + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 @@ -581,7 +581,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong a[2]; // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -813,7 +813,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = ((~d) ^ q) & MASK; #endif - + } } mem_fence(CLK_GLOBAL_MEM_FENCE); diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 18ef8c696..98d90abe0 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -22,7 +22,7 @@ R"===(// generated by XMRSTAK_VERSION * in this case set the intensity to a multiple of the worksize or activate comp_mode. 
* "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index fb1a04b4c..fab91d7e3 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -150,7 +150,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) } cfg.memChunk = (int)memChunk->GetInt64(); - + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128) { printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two"); diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 5ac246335..45979cbd6 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -236,7 +236,7 @@ void minethd::work_main() if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) break; } - + cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 04c2ff8ad..32e66ec87 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -31,7 +31,7 @@ class minethd : public iBackend void work_main(); uint64_t iJobNo; - + miner_work oWork; std::promise order_fix; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index e8c0aca2b..2b1741764 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -797,7 +797,7 @@ template< > struct Cryptonight_hash<1> { static constexpr size_t N = 1; - + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { diff --git a/xmrstak/backend/cpu/minethd.cpp 
b/xmrstak/backend/cpu/minethd.cpp index bb80b938f..3a94daa5f 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -475,7 +475,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc { // AMD Ryzen (1xxx and 2xxx series) return Cryptonight_hash_asm<1u, 1u>::template hash; - } + } } } // We have two independent flag bits in the functions diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 3bd7d0eea..4eeed3c4b 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -39,7 +39,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) threadWork = oGlobalWork; currentJobId = iGlobalJobNo.load(std::memory_order_relaxed); - + jobLock.UnLock(); } @@ -51,7 +51,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) * To avoid duplicated shared this must be done before the nonce is exchanged. */ iGlobalJobNo++; - + size_t xid = dat.pool_id; dat.pool_id = pool_id; pool_id = xid; @@ -62,7 +62,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) */ dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed); oGlobalWork = pWork; - + jobLock.UnLock(); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 0f6e47cca..cceca876d 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -287,7 +287,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in volatile uint32_t* sPtr = NULL; // 8 x 64bit values u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8); - + #endif __syncthreads( ); @@ -344,10 +344,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ), t_fn0( cx.y & 0xff ) ^ t_fn1( 
(cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) ) ); - + if(ALGO == cryptonight_monero_v8) { - + const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ]; const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; @@ -376,7 +376,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in if(ALGO != cryptonight_monero_v8) bx0 = cx_aes; - + uint64_t cx_mul; ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); @@ -400,7 +400,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in __syncthreads( ); #endif uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ]; - + { uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; // sub 0 -> hi, sub 1 -> lo @@ -426,7 +426,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in { bx1 = bx0; bx0 = cx_aes; - } + } myChunks[ idx1 + sub ] = ax0; for(int x = 0; x < 8; x += 2) { @@ -864,5 +864,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } - + } diff --git a/xmrstak/cpputil/read_write_lock.h b/xmrstak/cpputil/read_write_lock.h index 9139dfd22..51f42a2e6 100644 --- a/xmrstak/cpputil/read_write_lock.h +++ b/xmrstak/cpputil/read_write_lock.h @@ -61,7 +61,7 @@ class RWLock std::unique_lock lck(mtx_); if (status_ == -1) { status_ = 0; - } + } else { status_ -= 1; @@ -72,7 +72,7 @@ class RWLock { write_cv_.notify_one(); } - } + } else { read_cv_.notify_all(); From 21ce03855d168b624f2fda67ad5ac933b3c6b74c Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Thu, 4 Oct 2018 21:35:21 +0200 Subject: [PATCH 56/77] add cpu family and model detection Helper functions to select the asm version based on the number of used hashes per threads and the family name of the cpu. 
- use the noew cpu type functions to fix the wrong AMD family detection in `autoAdjust.hpp` - allow to set the asm version to `auto` - rename asm option `intel` to `intel_avx` - rename asm option `ryzen` to `amd_avx` Co-authored-by: fireice-uk --- xmrstak/backend/cpu/autoAdjust.hpp | 14 ++--- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +- xmrstak/backend/cpu/config.tpl | 13 ++-- xmrstak/backend/cpu/cpuType.cpp | 79 ++++++++++++++++++++++++ xmrstak/backend/cpu/cpuType.hpp | 32 ++++++++++ xmrstak/backend/cpu/minethd.cpp | 81 +++++++++++++++++-------- 6 files changed, 179 insertions(+), 42 deletions(-) create mode 100644 xmrstak/backend/cpu/cpuType.cpp create mode 100644 xmrstak/backend/cpu/cpuType.hpp diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index b192ddc35..e7f3e9148 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -7,6 +7,7 @@ #include "xmrstak/misc/configEditor.hpp" #include "xmrstak/params.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include #ifdef _WIN32 @@ -20,14 +21,6 @@ namespace xmrstak { namespace cpu { -// Mask bits between h and l and return the value -// This enables us to put in values exactly like in the manual -// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22) -inline int32_t get_masked(int32_t val, int32_t h, int32_t l) -{ - val &= (0x7FFFFFFF >> (31-(h-l))) << l; - return val >> l; -} class autoAdjust { @@ -82,7 +75,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? 
"true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); @@ -143,7 +136,8 @@ class autoAdjust L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; ::jconf::cpuid(1, 0, cpu_info); - if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen + + if(getModel().family < 0x17) //0x17h is Zen old_amd = true; return true; diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 7180491f7..b61582588 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 37158d6e2..1a64860e4 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -11,10 +11,11 @@ R"===(// generated by XMRSTAK_VERSION * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. 
* - * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [off, intel, ryzen] - * - off: used the default implementation (no assembler version) - * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) - * - ryzen: AMD Ryzen (1xxx and 2xxx series) + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, off, intel_avx, amd_avx] + * - auto: xmr-stak will automatically detect the asm type (default) + * - off: disable the usage of optimized assembler + * - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx + * - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series * * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting @@ -27,8 +28,8 @@ R"===(// generated by XMRSTAK_VERSION * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp new file mode 100644 index 000000000..5959b75cc --- /dev/null +++ b/xmrstak/backend/cpu/cpuType.cpp @@ -0,0 +1,79 @@ + +#include "xmrstak/backend/cpu/cpuType.hpp" + +#include +#include +#include + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include +#else +#include +#endif + +namespace xmrstak +{ +namespace cpu +{ + void cpuid(uint32_t eax, int32_t 
ecx, int32_t val[4]) + { + std::memset(val, 0, sizeof(int32_t)*4); + + #ifdef _WIN32 + __cpuidex(val, eax, ecx); + #else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); + #endif + } + + int32_t get_masked(int32_t val, int32_t h, int32_t l) + { + val &= (0x7FFFFFFF >> (31-(h-l))) << l; + return val >> l; + } + + bool has_feature(int32_t val, int32_t bit) + { + int32_t mask = 1 << bit; + return (val & mask) != 0u; + + } + + Model getModel() + { + int32_t cpu_info[4]; + char cpustr[13] = {0}; + + cpuid(0, 0, cpu_info); + std::memcpy(cpustr, &cpu_info[1], 4); + std::memcpy(cpustr+4, &cpu_info[3], 4); + std::memcpy(cpustr+8, &cpu_info[2], 4); + + Model result; + + cpuid(1, 0, cpu_info); + + result.family = get_masked(cpu_info[0], 12, 8); + result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; + result.type_name = cpustr; + + // feature bits https://en.wikipedia.org/wiki/CPUID + // sse2 + result.sse2 = has_feature(cpu_info[3], 26); + // aes-ni + result.aes = has_feature(cpu_info[2], 25); + // avx + result.avx = has_feature(cpu_info[2], 28); + + if(strcmp(cpustr, "AuthenticAMD") == 0) + { + if(result.family == 0xF) + result.family += get_masked(cpu_info[0], 28, 20); + } + + return result; + } + +} // namespace cpu +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp new file mode 100644 index 000000000..7f6bfaf51 --- /dev/null +++ b/xmrstak/backend/cpu/cpuType.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + + +namespace xmrstak +{ +namespace cpu +{ + struct Model + { + uint32_t family = 0u; + uint32_t model = 0u; + bool aes = false; + bool sse2 = false; + bool avx = false; + std::string type_name = "unknown"; + }; + + Model getModel(); + + /** Mask bits between h and l and return the value + * + * This enables us to put in values exactly like in the manual + * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22) + */ + int32_t get_masked(int32_t val, int32_t h, 
int32_t l); + + +} // namespace cpu +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 3a94daa5f..795ed1b65 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -27,6 +27,7 @@ #include "xmrstak/backend/iBackend.hpp" #include "xmrstak/backend/globalStates.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include "xmrstak/params.hpp" #include "jconf.hpp" @@ -449,35 +450,33 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work return pvThreads; } +/** get the supported asm name + * + * @return asm type based on the number of hashes per thread the internal + * evaluated cpu type + */ +static std::string getAsmName(const uint32_t num_hashes) +{ + std::string asm_type = "off"; + if(num_hashes == 0) + return asm_type; + + auto cpu_model = getModel(); + + if(cpu_model.avx && cpu_model.aes) + { + if(cpu_model.type_name.find("Intel") != std::string::npos) + asm_type = "intel_avx"; + else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1) + asm_type = "amd_avx"; + } +} + template minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); - // check for asm optimized version for cryptonight_v8 - if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) - { - if(asm_version_str != "off") - { - if(asm_version_str != "intel" && asm_version_str != "ryzen") - printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); - - if(asm_version_str == "intel") - { - // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) - if(N == 1) - return Cryptonight_hash_asm<1u, 0u>::template hash; - else if(N == 2) - return Cryptonight_hash_asm<2u, 0u>::template hash; - } - // supports only 1 thread per 
hash - if(N == 1 && asm_version_str == "ryzen") - { - // AMD Ryzen (1xxx and 2xxx series) - return Cryptonight_hash_asm<1u, 1u>::template hash; - } - } - } // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -584,7 +583,39 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - return func_table[ algv << 2 | digit.to_ulong() ]; + auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; + + + // check for asm optimized version for cryptonight_v8 + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) + { + std::string selected_asm = asm_version_str; + if(selected_asm == "auto") + selected_asm = cpu::getAsmName(N); + + if(selected_asm != "off") + { + if(selected_asm == "intel_avx") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + if(N == 1) + selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + else if(N == 2) + selected_function = Cryptonight_hash_asm<2u, 0u>::template hash; + } + // supports only 1 thread per hash + if(N == 1 && selected_asm == "amd_avx") + { + // AMD Ryzen (1xxx and 2xxx series) + selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; + } + if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) + printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); + else if(selected_asm != "intel_avx" || selected_asm != "amd_avx") // unknown asm type + printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); + } + } + return selected_function; } minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) From 5df8075715d7d8d06f45994e2462eac6a7ae16a6 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 5 Oct 2018 20:42:19 
+0200 Subject: [PATCH 57/77] fix wrong option in config.tpl In #1839 the option for slow memory was silently changed. This can lead to crashes on linux systems where the user is not allowed to use large pages. --- xmrstak/config.tpl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index deb52aa09..96f0e9cb2 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -104,8 +104,7 @@ R"===(// generated by XMRSTAK_VERSION * It will never use slow memory, but it won't attempt to mlock---LINUX * never - If we fail to allocate large pages we will print an error and exit. */ -"use_slow_memory" : "warn",---WINDOWS -"use_slow_memory" : "no_mlck",---LINUX +"use_slow_memory" : "warn", /* * TLS Settings From 99a12cb6b155f27a8c62964efbdea37174224512 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 5 Oct 2018 21:05:18 +0200 Subject: [PATCH 58/77] CUDA: tune cryptonight_v8 Read memory in bigger chunks per thread to increase the used memory bandwidth. For Kepler and Fermi GPUs, use the old autosuggestion instead of the new settings for cryptonight_v8. 
--- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 21 +++++-------------- .../backend/nvidia/nvcc_code/cuda_extra.cu | 4 ++-- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 0f6e47cca..22bcf16eb 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -329,11 +329,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in { ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; - #pragma unroll 4 - for(int x = 0; x < 8; x += 2) - { - myChunks[x + sub] = ptr0[ x + sub ]; - } + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; uint32_t idx1 = (idx0 & 0x30) >> 3; @@ -362,17 +358,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } myChunks[ idx1 + sub ] = cx_aes ^ bx0; - for(int x = 0; x < 8; x += 2) - ptr0[ x + sub ] = myChunks[x + sub]; + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; - #pragma unroll 4 - for(int x = 0; x < 8; x += 2) - { - myChunks[x + sub] = ptr0[ x + sub ]; - } + + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; if(ALGO != cryptonight_monero_v8) bx0 = cx_aes; @@ -428,10 +420,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in bx0 = cx_aes; } myChunks[ idx1 + sub ] = ax0; - for(int x = 0; x < 8; x += 2) - { - ptr0[ x + sub ] = myChunks[x + sub]; - } + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; ax0 ^= c; idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index a4d88f21f..f136744d4 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -692,8 +692,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) 
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; - // overwrite default config if cryptonight_monero_v8 is mined - if(useCryptonight_v8) + // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 + if(useCryptonight_v8 && gpuArch >= 50) { // 4 based on my test maybe it must be adjusted later size_t threads = 4; From 8e1e7447c2c7d61a1c2f016d5e285c9a6d65ae9f Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 5 Oct 2018 22:21:52 +0200 Subject: [PATCH 59/77] fix invalid shares With rocm we fought with invalid shares for a very long time. This is now solved with rocm 1.9 and this tiny fix. It is not fully clear where a memory optimization kicks in and breaks the kernel `Groestl` if the variables `M` and `H` are not `volatile`. The performance will not change with this fix. The fix is tested with rocm 1.9 with a VEGA64 and a RX570 --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index e65f0ed05..317352722 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -1221,7 +1221,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global #pragma unroll 4 for(uint i = 0; i < 4; ++i) { - ulong H[8], M[8]; + volatile ulong H[8], M[8]; if(i < 3) { From 2370aeef739fd3901359b00d562ec99625b5099e Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Fri, 5 Oct 2018 22:19:15 -0600 Subject: [PATCH 60/77] Fix two new warnings within new code --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- xmrstak/backend/cpu/minethd.cpp | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index e2c2dfeb8..2fe0350a7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -377,7 +377,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } std::vector openCLDriverVer(1024); - if(ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL) != CL_SUCCESS) + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx ); return ERR_OCL_API; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 795ed1b65..912ef48bb 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -458,18 +458,19 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work static std::string getAsmName(const uint32_t num_hashes) { std::string asm_type = "off"; - if(num_hashes == 0) - return asm_type; - - auto cpu_model = getModel(); - - if(cpu_model.avx && cpu_model.aes) + if(num_hashes != 0) { - if(cpu_model.type_name.find("Intel") != std::string::npos) - asm_type = "intel_avx"; - else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1) - asm_type = "amd_avx"; + auto cpu_model = getModel(); + + if(cpu_model.avx && cpu_model.aes) + { + if(cpu_model.type_name.find("Intel") != std::string::npos) + asm_type = "intel_avx"; + else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1) + asm_type = "amd_avx"; + } } + return asm_type; } template From 746037d8fb33608224d6c2f17cbda91e5a328d3c Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 7 Oct 2018 10:05:57 +0200 Subject: [PATCH 61/77] OpenCL: fix definition range for unroll fix #1870 - remove zero from the valod 
definition range for the loop unroll option --- xmrstak/backend/amd/config.tpl | 2 +- xmrstak/backend/amd/jconf.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index b852a7e81..c3da93a38 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -15,7 +15,7 @@ R"===(// generated by XMRSTAK_VERSION * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) - * unroll - allow to control how often the POW main loop is unrolled; valid range [0;128) - for most OpenCL implementations it must be a power of two. + * unroll - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two. * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. 
* If you set false and the intensity is not multiple of the worksize the miner can crash: diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index fab91d7e3..152f8add4 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -151,9 +151,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) cfg.memChunk = (int)memChunk->GetInt64(); - if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128) + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (int)unroll->GetInt64() == 0) { - printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two"); + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero"); return false; } cfg.unroll = (int)unroll->GetInt64(); From 1c0ef1548f1890cb80c5e41d12b42987ed3fb6a1 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 7 Oct 2018 21:30:48 +0200 Subject: [PATCH 62/77] fix crash with monero and strided_index Strided index 1 is not allowed for cryptonight_v8 and monero. In the case the dev pool is set to monero and the user tuned their settings for another currency the miner will crash if strided index or memChunk does not meet the requirements to mine monero. This PR detects wrong configurations and will set strided index and memChunk to a valid value but only for cryptonight_v8. The user pool settings will only be changed if monero or cryptonight_v8 is selected. 
--- xmrstak/backend/amd/amd_gpu/gpu.cpp | 32 ++++++++++++++--------------- xmrstak/backend/amd/config.tpl | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 2fe0350a7..7c7aff788 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -396,12 +396,26 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ int threadMemMask = cn_select_mask(miner_algo[ii]); int hashIterations = cn_select_iter(miner_algo[ii]); + size_t mem_chunk_exp = 1u << ctx->memChunk; + size_t strided_index = ctx->stridedIndex; + /* Adjust the config settings to a valid combination + * this is required if the dev pool is mining monero + * but the user tuned there settings for another currency + */ + if(miner_algo[ii] == cryptonight_monero_v8) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + std::string options; options += " -DITERATIONS=" + std::to_string(hashIterations); options += " -DMASK=" + std::to_string(threadMemMask); options += " -DWORKSIZE=" + std::to_string(ctx->workSize); - options += " -DSTRIDED_INDEX=" + std::to_string(ctx->stridedIndex); - options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << ctx->memChunk); + options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp); options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 
1u : 0u); options += " -DMEMORY=" + std::to_string(hashMemSize); options += " -DALGO=" + std::to_string(miner_algo[ii]); @@ -931,20 +945,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } - if(useCryptonight_v8) - { - if(ctx[i].stridedIndex == 1) - { - printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx); - return ERR_STUPID_PARAMS; - } - if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2) - { - printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx); - return ERR_STUPID_PARAMS; - } - } - if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index b852a7e81..49033c81b 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -10,7 +10,7 @@ R"===(// generated by XMRSTAK_VERSION * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks - * (not allowed for cryptonight_v8 and monero) + * (for cryptonight_v8 and monero it is equal to strided_index = 0) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 From 53652d35e707493416e0cdd5f8cbd9479294ac42 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 7 Oct 2018 10:11:06 +0200 Subject: [PATCH 63/77] CPU: fix logical error Fix wrong warning about unknown ASM type --- 
xmrstak/backend/cpu/minethd.cpp | 59 +++++++++++++++++---------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 795ed1b65..ccf802e12 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -586,35 +586,36 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; - // check for asm optimized version for cryptonight_v8 - if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) - { - std::string selected_asm = asm_version_str; - if(selected_asm == "auto") - selected_asm = cpu::getAsmName(N); - - if(selected_asm != "off") - { - if(selected_asm == "intel_avx") - { - // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) - if(N == 1) - selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; - else if(N == 2) - selected_function = Cryptonight_hash_asm<2u, 0u>::template hash; - } - // supports only 1 thread per hash - if(N == 1 && selected_asm == "amd_avx") - { - // AMD Ryzen (1xxx and 2xxx series) - selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; - } - if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) - printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); - else if(selected_asm != "intel_avx" || selected_asm != "amd_avx") // unknown asm type - printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); - } - } + // check for asm optimized version for cryptonight_v8 + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) + { + std::string selected_asm = asm_version_str; + if(selected_asm == "auto") + selected_asm = cpu::getAsmName(N); + + if(selected_asm != "off") + { + if(selected_asm == "intel_avx") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 
3xxx, Pentium G2xxx, Celeron G1xxx) + if(N == 1) + selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + else if(N == 2) + selected_function = Cryptonight_hash_asm<2u, 0u>::template hash; + } + // supports only 1 thread per hash + if(N == 1 && selected_asm == "amd_avx") + { + // AMD Ryzen (1xxx and 2xxx series) + selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; + } + if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) + printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); + else if(selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type + printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); + } + } + return selected_function; } From eb8376faece53483f54cfa106254f11fab2d4d6d Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 8 Oct 2018 09:21:42 +0200 Subject: [PATCH 64/77] CUDA: use volatile pointer Use volatile pointer to be sure that the compiler is not caching the values. 
--- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 2be6f969f..b844e10c8 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -278,15 +278,15 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } #if( __CUDA_ARCH__ < 300 ) - extern __shared__ u64 externShared[]; + extern __shared__ uint64_t externShared[]; // 8 x 64bit values - u64* myChunks = (u64*)(externShared + (threadIdx.x >> 1) * 8); + volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); #else - extern __shared__ u64 chunkMem[]; + extern __shared__ uint64_t chunkMem[]; volatile uint32_t* sPtr = NULL; // 8 x 64bit values - u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8); + volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8); #endif @@ -301,25 +301,25 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; - u64 ax0 = ((u64*)(d_ctx_a + thread * 4))[sub]; - u64 bx0; - uint32_t idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); + uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; + uint64_t bx0; + uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); - u64* ptr0; + uint64_t* ptr0; - u64 bx1; + uint64_t bx1; uint32_t sqrt_result; uint64_t division_result; if(ALGO == cryptonight_monero_v8) { - bx0 = ((u64*)(d_ctx_b + thread * 12))[sub]; - bx1 = ((u64*)(d_ctx_b + thread * 12 + 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; + bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 
2))[0]; sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; } else - bx0 = ((u64*)(d_ctx_b + thread * 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); const int start = partidx * batchsize; @@ -327,7 +327,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in for(int i = start; i < end; ++i) { - ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; @@ -344,9 +344,9 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in if(ALGO == cryptonight_monero_v8) { - const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ]; - const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; - const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ]; + const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; #if (__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else @@ -362,7 +362,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; - ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; @@ -399,10 +399,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint64_t res = sub == 0 ? 
__umul64hi( cx_mul, cl ) : cx_mul * cl; if(ALGO == cryptonight_monero_v8) { - const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; - u64 chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; + uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; res ^= ((uint64_t*)&chunk2)[0]; - const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ]; + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; #if (__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else @@ -422,16 +422,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in myChunks[ idx1 + sub ] = ax0; ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; ax0 ^= c; - idx0 = shuffle<2>(sPtr, sub, ax0.x, 0); + idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); } if ( bfactor > 0 ) { - ((u64*)(d_ctx_a + thread * 4))[sub] = ax0; + ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; if(ALGO == cryptonight_monero_v8) { - ((u64*)(d_ctx_b + thread * 12))[sub] = bx0; - ((u64*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; + ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; if(sub == 1) { @@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } else - ((u64*)(d_ctx_b + thread * 12))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; } } From 9e592ec58211b91557f955718a02dc02f90981db Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 8 Oct 2018 14:48:22 +0200 Subject: [PATCH 65/77] compatibility and better messages - add more descriptive messages if memory allocation fails - add gnu compiler flags: `noexecstack` to support systemd - handle cases where memroy allocation fails Co-authored-by: Tony Butler --- CMakeLists.txt | 10 ++++++++-- xmrstak/backend/amd/minethd.cpp | 5 +++++ .../backend/cpu/crypto/cryptonight_common.cpp | 16 +++++++++------- xmrstak/backend/cpu/minethd.cpp | 8 ++++++++ xmrstak/cli/cli-miner.cpp | 1 + xmrstak/config.tpl | 5 ++--- 6 files changed, 33 insertions(+), 
12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 712fb429e..b714ee0ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -443,6 +443,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() endif() +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}") +endif() + # activate static libgcc and libstdc++ linking if(CMAKE_LINK_STATIC) set(BUILD_SHARED_LIBRARIES OFF) @@ -464,7 +469,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC") else() # asm optimized monero v8 code enable_language(ASM) - set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY C) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP) + set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") add_library(xmr-stak-asm STATIC "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" @@ -587,4 +593,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) else() # this rule is used if the install prefix is the build directory install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")") -endif() \ No newline at end of file +endif() diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 45979cbd6..5e70f25a6 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -174,6 +174,11 @@ void minethd::work_main() cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + if(cpu_ctx == nullptr) + { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped."); + win_exit(1); + } // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); cn_hash_fun hash_fun = 
cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a478c9b2a..a7e4696a8 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -216,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize); ptr->ctx_info[0] = 0; ptr->ctx_info[1] = 0; + if(ptr->long_state == NULL) + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str()); return ptr; } @@ -243,25 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al return ptr; } #else - +//http://man7.org/linux/man-pages/man2/mmap.2.html #if defined(__APPLE__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); #elif defined(__OpenBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); #else - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0); + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); #endif if (ptr->long_state == MAP_FAILED) { _mm_free(ptr); - msg->warning 
= "mmap failed"; + msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'"; return NULL; } diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 912ef48bb..3e90159ea 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -226,6 +226,7 @@ bool minethd::self_test() { if ((ctx[i] = minethd_alloc_ctx()) == nullptr) { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); for (int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); return false; @@ -683,6 +684,13 @@ void minethd::multiway_work_main() for (size_t i = 0; i < N; i++) { ctx[i] = minethd_alloc_ctx(); + if(ctx[i] == nullptr) + { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); + for (int j = 0; j < i; j++) + cryptonight_free_ctx(ctx[j]); + win_exit(1); + } piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24); piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr; } diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index ae39d2505..428952b1b 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -749,6 +749,7 @@ int main(int argc, char *argv[]) if (!BackendConnector::self_test()) { + printer::inst()->print_msg(L0, "Self test not passed!"); win_exit(); return 1; } diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 96f0e9cb2..73ae054c2 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -94,9 +94,8 @@ R"===(// generated by XMRSTAK_VERSION * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX - */ - -/* + * + * * use_slow_memory defines our behaviour with regards to large pages. 
There are three possible options here: * always - Don't even try to use large pages. Always use slow memory. * warn - We will try to use large pages, but fall back to slow memory if that fails. From 801556f693988045818d334d359045d8df26acc9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 8 Oct 2018 21:15:20 +0200 Subject: [PATCH 66/77] select hash function from function array Use an array instead of an if cascade to select the hashing function for CUDA. --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 65 ++++++------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index b844e10c8..8e69c2029 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -807,51 +807,26 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) } } +typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { - if(miner_algo == cryptonight_monero) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_monero_v8) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_heavy) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_lite) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_aeon) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_ipbc) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_stellite) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_masari) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_haven) - { - 
cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_bittube2) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - + if(miner_algo == invalid_algo) return; + + static const cuda_hash_fn func_table[] = { + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash + }; + + cuda_hash_fn selected_function = func_table[ miner_algo - 1u ]; + selected_function(ctx, startNonce); } From 594a5b4d5b515af2b4f66cf940c10e103ceee40a Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 8 Oct 2018 21:43:25 +0200 Subject: [PATCH 67/77] CUDA: add compatibility mode Add compatibility mode for CUDA to avoid invalid shares. --- xmrstak/backend/nvidia/autoAdjust.hpp | 1 + xmrstak/backend/nvidia/config.tpl | 3 + xmrstak/backend/nvidia/jconf.cpp | 14 ++- xmrstak/backend/nvidia/jconf.hpp | 1 + xmrstak/backend/nvidia/minethd.cpp | 1 + .../backend/nvidia/nvcc_code/cryptonight.hpp | 1 + xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 109 ++++++++++++++---- 7 files changed, 105 insertions(+), 25 deletions(-) diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 12468093c..6354f60f0 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -96,6 +96,7 @@ class autoAdjust " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + + " \"comp_mode\" : true,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 144da80b9..e2a76d90f 100644 --- 
a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -16,6 +16,9 @@ R"===(// generated by XMRSTAK_VERSION * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu * 2 = cudaDeviceScheduleYield * 3 = cudaDeviceScheduleBlockingSync (default) + * comp_mode - Compatibility if true it will use 64bit memory loads and if false it will use + * 128bit memory loads (can produce invalid results) + * (this option has only a meaning for cryptonight_v8 and monero) * * On the first run the miner will look at your system and suggest a basic configuration that will work, * you can try to tweak it from there to get the best performance. diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index c9d4f194c..b1059f359 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode; + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *compMode; gid = GetObjectMember(oThdConf, "index"); blocks = GetObjectMember(oThdConf, "blocks"); threads = GetObjectMember(oThdConf, "threads"); @@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bsleep = GetObjectMember(oThdConf, "bsleep"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); syncMode = GetObjectMember(oThdConf, "sync_mode"); + compMode = GetObjectMember(oThdConf, "comp_mode"); if(gid == nullptr || blocks == nullptr || threads == nullptr || - bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr) + bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr || + compMode == nullptr) { return false; } @@ -161,13 +163,19 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. 
( range: 0 <= sync_mode < 4.)"); return false; } + + if(!compMode->IsBool()) + return false; + + cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); cfg.bfactor = bfactor->GetInt(); cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); - + cfg.compMode = compMode->GetBool(); + if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); else diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index b4ebaa035..5ee1f8133 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -29,6 +29,7 @@ class jconf bool bNoPrefetch; int32_t cpu_aff; int syncMode; + bool compMode; long long iCpuAff; }; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 0153eed19..135f26ea9 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -78,6 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.device_bfactor = (int)cfg.bfactor; ctx.device_bsleep = (int)cfg.bsleep; ctx.syncMode = cfg.syncMode; + ctx.compMode = cfg.compMode; this->affinity = cfg.cpu_aff; std::future numa_guard = numa_promise.get_future(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index d588641b4..8167395e3 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -16,6 +16,7 @@ typedef struct { int device_bfactor; int device_bsleep; int syncMode; + bool compMode; uint32_t *d_input; uint32_t inputlen; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 8e69c2029..1c9c9df64 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" 
@@ -254,8 +255,12 @@ struct u64 : public uint2 } }; - -template +/** cryptonight with two threads per hash + * + * @tparam COMP_MODE if true than 64bit memory transfers per thread will be used to store/load data within shared memory + * else 128bit operations will be used + */ +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 2 ) #endif @@ -329,7 +334,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in { ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; uint32_t idx1 = (idx0 & 0x30) >> 3; @@ -358,13 +372,31 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } myChunks[ idx1 + sub ] = cx_aes ^ bx0; - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; if(ALGO != cryptonight_monero_v8) bx0 = cx_aes; @@ -420,7 +452,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in bx0 = cx_aes; } myChunks[ idx1 + sub ] = ax0; - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + if(COMP_MODE) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; ax0 ^= c; idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); } @@ 
-699,7 +740,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template +template void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) { dim3 grid( ctx->device_blocks ); @@ -741,7 +782,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_double<<< + cryptonight_core_gpu_phase2_double<<< grid, block2, sizeof(uint64_t) * block2.x * 8 + @@ -807,26 +848,50 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) } } -typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); - void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { + typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + if(miner_algo == invalid_algo) return; static const cuda_hash_fn func_table[] = { - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + 
cryptonight_core_gpu_hash }; - cuda_hash_fn selected_function = func_table[ miner_algo - 1u ]; + std::bitset<1> digit; + digit.set(0, ctx->compMode); + + cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; selected_function(ctx, startNonce); } From 58b7c66c06519f84328a7553459f99c77446b2f7 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 8 Oct 2018 22:16:49 +0200 Subject: [PATCH 68/77] improve error message Add a suggestion to a common line which is shown in the event of a crash under Windows. --- xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index f136744d4..433e175dd 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -415,7 +415,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, } CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); - CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK_MSG( + ctx->device_id, + "\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**", + cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ) + ); /* There is only a 32bit limit for the counter on the device side * therefore this value can be greater than 10, in that case limit rescount From b9eed59febf2ce7ed914382119559cd320c0e3a9 Mon Sep 17 00:00:00 2001 From: fireice-uk Date: Tue, 9 Oct 2018 20:58:33 +0100 Subject: [PATCH 69/77] Add Ryo sponsorship message --- README.md | 5 ++++- xmrstak/cli/cli-miner.cpp | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2fe1bc511..046a930e1 100644 --- a/README.md +++ 
b/README.md @@ -46,9 +46,12 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Intense](https://intensecoin.com) - [Masari](https://getmasari.org) - [QRL](https://theqrl.org) -- [Ryo](https://ryo-currency.com) +- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** - [TurtleCoin](https://turtlecoin.lol) +Ryo currency is a way for us to implement the ideas that we were unable to in +Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. + If your prefered coin is not listed, you can choose one of the following algorithms: - 1MiB scratchpad memory diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 428952b1b..171e6dec9 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -783,11 +783,24 @@ int main(int argc, char *argv[]) char buffer[64]; snprintf(buffer, sizeof(buffer), "\nConfigurable dev donation level is set to %.1f%%\n\n", fDevDonationLevel * 100.0); printer::inst()->print_str(buffer); + printer::inst()->print_str("-------------------------------------------------------------------\n"); printer::inst()->print_str("You can use following keys to display reports:\n"); printer::inst()->print_str("'h' - hashrate\n"); printer::inst()->print_str("'r' - results\n"); printer::inst()->print_str("'c' - connection\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); + printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n"); + printer::inst()->print_str(" ##### ______ _____\n"); + printer::inst()->print_str(" ## ## | ___ \\ / __ \\\n"); + printer::inst()->print_str("# _ #| |_/ /_ _ ___ | / \\/ _ _ _ _ _ _ ___ _ __ ___ _ _\n"); + printer::inst()->print_str("# |_| #| /| | | | / _ \\ | | | | | || '_|| '_|/ _ \\| '_ \\ / __|| | | |\n"); + printer::inst()->print_str("# #| |\\ \\| |_| || (_) || \\__/\\| |_| || | | | | __/| | | || (__ | |_| |\n"); + printer::inst()->print_str(" 
## ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_| |_| \\___||_| |_| \\___| \\__, |\n"); + printer::inst()->print_str(" ##### __/ | __/ |\n"); + printer::inst()->print_str(" |___/ https://ryo-currency.com |___/\n\n"); + printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n"); + printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n"); + printer::inst()->print_str("-------------------------------------------------------------------\n"); printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str()); if(params::inst().benchmark_block_version >= 0) From ed2168b48d16a9870cbef067d38a5ad16b26c9f9 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 10 Oct 2018 11:52:40 +0200 Subject: [PATCH 70/77] CUDA: fix invalid results If `comp_mode` is false the results on a windows platform will be invalid. The reason for that is that `ulong4` is in windows 16byte and in linux 32byte. 
thx @xmrig for finding and solving the issue fix #1873 --- xmrstak/backend/nvidia/autoAdjust.hpp | 2 +- xmrstak/backend/nvidia/config.tpl | 2 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 6354f60f0..27783acd1 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -96,7 +96,7 @@ class autoAdjust " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + - " \"comp_mode\" : true,\n" + + " \"comp_mode\" : false,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index e2a76d90f..8803f6ff2 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -17,7 +17,7 @@ R"===(// generated by XMRSTAK_VERSION * 2 = cudaDeviceScheduleYield * 3 = cudaDeviceScheduleBlockingSync (default) * comp_mode - Compatibility if true it will use 64bit memory loads and if false it will use - * 128bit memory loads (can produce invalid results) + * 256bit memory loads (can produce invalid results) * (this option has only a meaning for cryptonight_v8 and monero) * * On the first run the miner will look at your system and suggest a basic configuration that will work, diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 1c9c9df64..3dce3e4ac 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -343,7 +343,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } else - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + 
((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; uint32_t idx1 = (idx0 & 0x30) >> 3; @@ -381,7 +381,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } else - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; @@ -396,7 +396,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } else - ((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub]; + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; if(ALGO != cryptonight_monero_v8) bx0 = cx_aes; @@ -461,7 +461,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } else - ((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub]; + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; ax0 ^= c; idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); } From b4387ac00dd6eec6ee1bef4736f02e646fa51428 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 10 Oct 2018 12:04:30 +0200 Subject: [PATCH 71/77] fix right bitshift in `amd_bitalign` In the current implementation the bit align is using signed integer which results in pulling in ones in the case the sign bit is set. 
- cast to unsigned integer before using bitshift --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 7689b5d3a..9c9bcd08e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -35,8 +35,8 @@ R"===( inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) { uint2 result; - result.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2)); - result.s1 = (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2)); + result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); + result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); return result; } #endif From bd4a4c94290f23bb38a4163baa3582c99eb84513 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 10 Oct 2018 17:35:45 +0200 Subject: [PATCH 72/77] NVIDIA: rename config option `comp_mode` The name `comp_mode` for a memory load pattern is a badly chosen name. Therefore I changed it to `mem_mode` which also gives us the possibility to add new modes later if needed. 
- rename `comp_mode` to `mem_mode` - fix documentation --- xmrstak/backend/nvidia/autoAdjust.hpp | 2 +- xmrstak/backend/nvidia/config.tpl | 8 +-- xmrstak/backend/nvidia/jconf.cpp | 15 +++-- xmrstak/backend/nvidia/jconf.hpp | 2 +- xmrstak/backend/nvidia/minethd.cpp | 2 +- .../backend/nvidia/nvcc_code/cryptonight.hpp | 2 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 64 +++++++++---------- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 27783acd1..2755e03d2 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -96,7 +96,7 @@ class autoAdjust " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + - " \"comp_mode\" : false,\n" + + " \"mem_mode\" : 1,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 8803f6ff2..8a5982b57 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -16,9 +16,9 @@ R"===(// generated by XMRSTAK_VERSION * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu * 2 = cudaDeviceScheduleYield * 3 = cudaDeviceScheduleBlockingSync (default) - * comp_mode - Compatibility if true it will use 64bit memory loads and if false it will use - * 256bit memory loads (can produce invalid results) - * (this option has only a meaning for cryptonight_v8 and monero) + * mem_mode - select the memory access pattern (this option has only a meaning for cryptonight_v8 and monero) + * 0 = 64bit memory loads + * 1 = 256bit memory loads * * On the first run the miner will look at your system and suggest a basic configuration that will work, * you can try to tweak it from there to get 
the best performance. @@ -27,7 +27,7 @@ R"===(// generated by XMRSTAK_VERSION * "gpu_threads_conf" : * [ * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - * "affine_to_cpu" : false, "sync_mode" : 3, + * "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1 * }, * ], * If you do not wish to mine with your nVidia GPU(s) then use: diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index b1059f359..6c443343b 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *compMode; + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *memMode; gid = GetObjectMember(oThdConf, "index"); blocks = GetObjectMember(oThdConf, "blocks"); threads = GetObjectMember(oThdConf, "threads"); @@ -131,11 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bsleep = GetObjectMember(oThdConf, "bsleep"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); syncMode = GetObjectMember(oThdConf, "sync_mode"); - compMode = GetObjectMember(oThdConf, "comp_mode"); + memMode = GetObjectMember(oThdConf, "mem_mode"); if(gid == nullptr || blocks == nullptr || threads == nullptr || bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr || - compMode == nullptr) + memMode == nullptr) { return false; } @@ -160,12 +160,15 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3) { - printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)"); + printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or not a number. 
( range: 0 <= sync_mode < 4.)"); return false; } - if(!compMode->IsBool()) + if(!memMode->IsNumber() || memMode->GetInt() < 0 || memMode->GetInt() > 1) + { + printer::inst()->print_msg(L0, "Error NVIDIA: mem_mode out of range or not a number. (range: 0 or 1)"); return false; + } cfg.id = gid->GetInt(); @@ -174,7 +177,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) cfg.bfactor = bfactor->GetInt(); cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); - cfg.compMode = compMode->GetBool(); + cfg.memMode = memMode->GetInt(); if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index 5ee1f8133..40b72f880 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -29,7 +29,7 @@ class jconf bool bNoPrefetch; int32_t cpu_aff; int syncMode; - bool compMode; + int memMode; long long iCpuAff; }; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 135f26ea9..e82ec91c3 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -78,7 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.device_bfactor = (int)cfg.bfactor; ctx.device_bsleep = (int)cfg.bsleep; ctx.syncMode = cfg.syncMode; - ctx.compMode = cfg.compMode; + ctx.memMode = cfg.memMode; this->affinity = cfg.cpu_aff; std::future numa_guard = numa_promise.get_future(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 8167395e3..8fda8d401 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -16,7 +16,7 @@ typedef struct { int device_bfactor; int device_bsleep; int syncMode; - bool compMode; + bool memMode; uint32_t *d_input; uint32_t inputlen; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 
3dce3e4ac..00a65332a 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -257,10 +257,10 @@ struct u64 : public uint2 /** cryptonight with two threads per hash * - * @tparam COMP_MODE if true than 64bit memory transfers per thread will be used to store/load data within shared memory - * else 128bit operations will be used + * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory + * else if `1` 256bit operations will be used */ -template +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 2 ) #endif @@ -334,7 +334,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in { ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - if(COMP_MODE) + if(MEM_MODE == 0) { #pragma unroll 4 for(int x = 0; x < 8; x += 2) @@ -372,7 +372,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } myChunks[ idx1 + sub ] = cx_aes ^ bx0; - if(COMP_MODE) + if(MEM_MODE == 0) { #pragma unroll 4 for(int x = 0; x < 8; x += 2) @@ -387,7 +387,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in idx1 = (idx0 & 0x30) >> 3; ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; - if(COMP_MODE) + if(MEM_MODE == 0) { #pragma unroll 4 for(int x = 0; x < 8; x += 2) @@ -452,7 +452,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in bx0 = cx_aes; } myChunks[ idx1 + sub ] = ax0; - if(COMP_MODE) + if(MEM_MODE == 0) { #pragma unroll 4 for(int x = 0; x < 8; x += 2) @@ -740,7 +740,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template +template void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) { dim3 grid( ctx->device_blocks ); @@ -782,7 +782,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_MSG_KERNEL( 
ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_double<<< + cryptonight_core_gpu_phase2_double<<< grid, block2, sizeof(uint64_t) * block2.x * 8 + @@ -855,42 +855,42 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t if(miner_algo == invalid_algo) return; static const cuda_hash_fn func_table[] = { - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash }; std::bitset<1> digit; - digit.set(0, ctx->compMode); + digit.set(0, ctx->memMode == 1); cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; selected_function(ctx, startNonce); From 5a80f50a629ff487cd557384f39082af56f79532 Mon Sep 
17 00:00:00 2001 From: psychocrypt Date: Wed, 10 Oct 2018 17:41:39 +0200 Subject: [PATCH 73/77] update version to 2.5.0 --- xmrstak/version.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index cd8bb52f5..80c25c6b8 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.4.7" +#define XMR_STAK_VERSION "2.5.0" #if defined(_WIN32) #define OS_TYPE "win" From b1504b36e756269fc47cbf9ad9a959ce3d9ccba7 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 10 Oct 2018 20:51:59 +0200 Subject: [PATCH 74/77] NVIDIA: tweak `get_reciprocal` - remove helper array to perform division - tweak `get_reciprocal` --- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 13 +-- .../nvcc_code/cuda_fast_int_math_v2.hpp | 82 ++++--------------- 2 files changed, 19 insertions(+), 76 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 00a65332a..7742e740e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -271,17 +271,6 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in cn_aes_gpu_init( sharedMemory ); - uint32_t* RCP; - if(ALGO == cryptonight_monero_v8) - { - __shared__ uint32_t RCP_shared[256]; - for (int i = threadIdx.x; i < 256; i += blockDim.x) - { - RCP_shared[i] = RCP_C[i]; - } - RCP = RCP_shared; - } - #if( __CUDA_ARCH__ < 300 ) extern __shared__ uint64_t externShared[]; // 8 x 64bit values @@ -413,7 +402,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ((uint64_t*)myChunks)[ idx1 ] ^= division_result; const uint32_t dd = (static_cast(cx_mul) + (sqrt_result << 1)) | 0x80000001UL; - division_result = fast_div_v2(RCP, cx_aes, dd); + division_result = fast_div_v2(cx_aes, dd); // Use division_result as an input for the square root to prevent 
parallel implementation in hardware sqrt_result = fast_sqrt_v2(cx_mul + division_result); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp index e3220230a..796b7adda 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -2,80 +2,35 @@ #include -static __constant__ const uint32_t RCP_C[256] = +__device__ __forceinline__ uint32_t get_reciprocal(uint32_t a) { - 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, - 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, - 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, - 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, - 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, - 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, - 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, - 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, - 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, - 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, - 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, - 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, - 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, - 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, - 
0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, - 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, - 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, - 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, - 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, - 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, - 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, - 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, - 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, - 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, - 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, - 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, - 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, - 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, - 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, - 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, - 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, - 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, -}; + const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23)); + const float a_lo = __uint2float_rn(a & 0xFF); -__device__ __forceinline__ uint32_t get_reciprocal(const uint32_t* RCP, uint32_t a) -{ - const uint32_t index1 
= (a & 0x7F000000U) >> 23; - const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; - - const uint32_t r1 = RCP[index1]; - uint32_t r2_0 = RCP[index1 + 1]; - if (index2 > 0) r2_0 >>= 16; - const int r2 = r2_0 & 0xFFFFU; - - const uint32_t r = r1 - (uint32_t)(__mul24(r2, index2) >> 6); - - const uint64_t lo0 = (uint64_t)(r) * a; - uint64_t lo = lo0 + ((uint64_t)(a) << 32); + float r; + asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi)); + const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23)); - a >>= 1; - const bool b = (a >= lo) || (lo >= lo0); - lo = a - lo; - - const uint64_t k = __umulhi((uint32_t)lo, r) + ((uint64_t)(r) * ((uint32_t*)&lo)[1]) + lo; - return ((uint32_t*)&k)[1] + (b ? r : 0); + const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f)); + return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled); } -__device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, uint32_t b) +__device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) { - const uint32_t r = get_reciprocal(RCP, b); - const uint64_t k = __umulhi((uint32_t)a, r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; + const uint32_t r = get_reciprocal(b); + const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; uint32_t q[2]; q[0] = ((uint32_t*)&k)[1]; - q[1] = (k < a) ? 1 : 0; - const int64_t tmp = a - *((uint64_t*)(q)) * b; - const uint32_t overshoot = (tmp < 0) ? 1u : 0U; - const uint32_t undershoot = (tmp >= b) ? 1u : 0U; + int64_t tmp = a - (uint64_t)(q[0]) * b; + ((int32_t*)(&tmp))[1] -= (k < a) ? b : 0; + + const bool overshoot = ((int32_t*)(&tmp))[1] < 0; + const bool undershoot = tmp >= b; - q[0] += undershoot - overshoot; - q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? b : 0U); + q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); + q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? 
b : 0U); return *((uint64_t*)(q)); } @@ -102,6 +57,5 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0; result += (overshoot+undershoot); - return result; } From bf882d44a4a341d7dcda7717f095a10f8a954fea Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 10 Oct 2018 21:38:15 +0200 Subject: [PATCH 75/77] update documantion - update tuning guide - update miner usage --- doc/tuning.md | 39 +++++++++++++++++++++++++++++++++------ doc/usage.md | 28 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/doc/tuning.md b/doc/tuning.md index 6bf036e9f..2673d68d9 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -9,6 +9,7 @@ * [AMD Backend](#amd-backend) * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) + * [Two Threads per GPU](two-threads-per-gpu) * [disable comp_mode](#disable-comp_mode) * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) * [Increase Memory Pool](#increase-memory-pool) @@ -55,10 +56,10 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`. 
"gpu_threads_conf" : [ { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, }, { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, }, ], ``` @@ -82,11 +83,37 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to ``` "gpu_threads_conf" : [ - { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true + { + "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, - { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true + { + "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + }, +], + +"platform_index" : 0, +``` + +### Two Threads per GPU + +Some GPUs like AMD Vega can mine faster if two threads are using the same GPU. +Use the auto generated config as base and repeat the config entry for a GPU. +If the attribute `index` is used twice than two threads will use one GPU. +Take care that the required memory usage on the GPU will also double. +Therefore adjust your intensity by hand. 
+ +``` +"gpu_threads_conf" : +[ + { + "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + }, + { + "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, ], diff --git a/doc/usage.md b/doc/usage.md index 886c1b319..a371f0e67 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -5,6 +5,7 @@ * [Usage on Windows](#usage-on-windows) * [Usage on Linux](#usage-on-linux) * [Command Line Options](#command-line-options) +* [Use different backends](#use-different-backends) * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton) ## Configurations @@ -34,6 +35,33 @@ Note: If the pool is ignoring the option `rig_id` in `pools.txt` to name your wo The miner allow to overwrite some of the settings via command line options. Run `xmr-stak --help` to show all available command line options. +## Use Different Backends + +On linux and OSX please add `./` before the binary name `xmr-stak`. + +### CPU Only: +``` +xmr-stak --noAMD --noNVIDIA +``` + +### NVIDIA/AMD Only: + +The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available. + +``` +xmr-stak --noCPU +``` +**CUDA** is currently not supported. I am currently try to get some performance out it. + +### NVIDIA via OpenCL + +It is possible to use the OpenCl backend which is originally created for AMD GPUs with NVIDIA GPus. +Some NVIDIA GPUs can reach better performance with this backend. 
+ +``` +xmr-stak --openCLVendor NVIDIA --noNVIDIA +``` + ## Docker image usage You can run the Docker image the following way: From 074a9d208b87e2b4a1205c8bb7e04260e14d81b1 Mon Sep 17 00:00:00 2001 From: fireice-uk Date: Thu, 11 Oct 2018 15:16:35 +0100 Subject: [PATCH 76/77] fix logo on Windows --- xmrstak/cli/cli-miner.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 171e6dec9..40fb9d948 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -790,14 +790,14 @@ int main(int argc, char *argv[]) printer::inst()->print_str("'c' - connection\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n"); - printer::inst()->print_str(" ##### ______ _____\n"); - printer::inst()->print_str(" ## ## | ___ \\ / __ \\\n"); - printer::inst()->print_str("# _ #| |_/ /_ _ ___ | / \\/ _ _ _ _ _ _ ___ _ __ ___ _ _\n"); - printer::inst()->print_str("# |_| #| /| | | | / _ \\ | | | | | || '_|| '_|/ _ \\| '_ \\ / __|| | | |\n"); - printer::inst()->print_str("# #| |\\ \\| |_| || (_) || \\__/\\| |_| || | | | | __/| | | || (__ | |_| |\n"); - printer::inst()->print_str(" ## ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_| |_| \\___||_| |_| \\___| \\__, |\n"); - printer::inst()->print_str(" ##### __/ | __/ |\n"); - printer::inst()->print_str(" |___/ https://ryo-currency.com |___/\n\n"); + printer::inst()->print_str(" ##### ______ ____\n"); + printer::inst()->print_str(" ## ## | ___ \\ / _ \\\n"); + printer::inst()->print_str("# _ #| |_/ /_ _ ___ | / \\/ _ _ _ _ _ _ ___ _ __ ___ _ _\n"); + printer::inst()->print_str("# |_| #| /| | | | / _ \\ | | | | | || '_|| '_|/ _ \\| '_ \\ / __|| | | |\n"); + printer::inst()->print_str("# #| |\\ \\| |_| || (_) || \\_/\\| |_| || | | | | __/| | | || (__ | |_| |\n"); + printer::inst()->print_str(" ## ## \\_| \\_|\\__, | \\___/ \\____/ 
\\__,_||_| |_| \\___||_| |_| \\___| \\__, |\n"); + printer::inst()->print_str(" ##### __/ | __/ |\n"); + printer::inst()->print_str(" |___/ https://ryo-currency.com |___/\n\n"); printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n"); printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); From 732b0e4115cf882d5c17479d36b9b37fa8fcdce1 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Thu, 11 Oct 2018 10:30:51 +0200 Subject: [PATCH 77/77] NVIDIA: support for multiple CUDA libs Allow to ship the miner with multiple cuda backends those depends on different driver versions. This will allow to support Turing/Volta and old Fermi GPU within one release. - add support to search for the first working CUDA backend - add some more messages to support better debugging (if a user has some issues) --- xmrstak/backend/backendConnector.cpp | 46 +++++++++++++++++++++++----- xmrstak/backend/nvidia/minethd.cpp | 4 +++ xmrstak/backend/plugin.hpp | 31 +++++++++++-------- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp index 525413fd5..92bb01506 100644 --- a/xmrstak/backend/backendConnector.cpp +++ b/xmrstak/backend/backendConnector.cpp @@ -63,10 +63,35 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) #ifndef CONF_NO_CUDA if(params::inst().useNVIDIA) { - plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend"); - std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); - pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); - if(nvidiaThreads->size() == 0) + plugin nvidiaplugin; + std::vector* nvidiaThreads; + std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", 
"xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"}; + size_t numWorkers = 0u; + + for( const auto & name : libNames) + { + printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str()); + nvidiaplugin.load("NVIDIA", name); + std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); + if(nvidiaThreads != nullptr) + { + pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); + numWorkers = nvidiaThreads->size(); + delete nvidiaThreads; + } + else + { + // remove the plugin if we have found no GPUs + nvidiaplugin.unload(); + } + // we found at leat one working GPU + if(numWorkers != 0) + { + printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str()); + break; + } + } + if(numWorkers == 0) printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled."); } #endif @@ -75,10 +100,17 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) if(params::inst().useAMD) { const std::string backendName = xmrstak::params::inst().openCLVendor; - plugin amdplugin(backendName, "xmrstak_opencl_backend"); + plugin amdplugin; + amdplugin.load(backendName, "xmrstak_opencl_backend"); std::vector* amdThreads = amdplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); - pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); - if(amdThreads->size() == 0) + size_t numWorkers = 0u; + if(amdThreads != nullptr) + { + pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); + numWorkers = amdThreads->size(); + delete amdThreads; + } + if(numWorkers == 0) printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str()); } #endif diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index e82ec91c3..6460628de 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ 
-165,6 +165,10 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor std::cout<<"WARNING: NVIDIA no device found"<GetGPUThreadCount(); pvThreads->reserve(n); diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp index 1811af224..5c7dfe16a 100644 --- a/xmrstak/backend/plugin.hpp +++ b/xmrstak/backend/plugin.hpp @@ -27,8 +27,11 @@ namespace xmrstak struct plugin { - plugin(const std::string backendName, const std::string libName) : fn_startBackend(nullptr), m_backendName(backendName) + plugin() = default; + + void load(const std::string backendName, const std::string libName) { + m_backendName = backendName; #ifdef WIN32 libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); if(!libBackend) @@ -81,32 +84,36 @@ struct plugin if(fn_startBackend == nullptr) { std::vector* pvThreads = new std::vector(); - std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl; return pvThreads; } return fn_startBackend(threadOffset, pWork, env); } + void unload() + { + if(libBackend) + { +#ifdef WIN32 + FreeLibrary(libBackend); +#else + dlclose(libBackend); +#endif + } + fn_startBackend = nullptr; + } + std::string m_backendName; typedef std::vector* (*startBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env); - startBackend_t fn_startBackend; + startBackend_t fn_startBackend = nullptr; #ifdef WIN32 HINSTANCE libBackend; #else - void *libBackend; -#endif - -/* \todo add unload to destructor and change usage of plugin that libs kept open until the miner ends -#ifdef WIN32 - FreeLibrary(libBackend); -#else - dlclose(libBackend); + void *libBackend = nullptr; #endif - * */ }; } // namespace xmrstak