From 24c2ae1128c4f494434a1190397e8d5f84d9f5ce Mon Sep 17 00:00:00 2001
From: Kundrata <ffkundrata@gmail.com>
Date: Mon, 12 Mar 2018 20:48:35 +0200
Subject: [PATCH 01/77] Correct api.json information

An attempt to fix issue 1084: https://github.com/fireice-uk/xmr-stak/issues/1084
---
 xmrstak/misc/executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index c4ba26ead..473612cec 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -1223,7 +1223,7 @@ void executor::http_json_report(std::string& out)
 		if(i != 0) cn_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiConnectionError,
-			int_port(duration_cast<seconds>(vMineResults[i].time.time_since_epoch()).count()),
+			int_port(duration_cast<seconds>(vSocketLog[i].time.time_since_epoch()).count()),
 			vSocketLog[i].msg.c_str());
 		cn_error.append(buffer);
 	}

From 038c88efbcc75d29f62d6f32bc2d36269a867a4c Mon Sep 17 00:00:00 2001
From: maurezen <maurezen@users.noreply.github.com>
Date: Tue, 5 Jun 2018 08:40:34 +0300
Subject: [PATCH 02/77] Update compile_Linux.md

RAM requirement mention in compile_linux
---
 doc/compile_Linux.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index 072402ff7..2e45b21dd 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -105,6 +105,8 @@ In that case you can force CUDA to use an older compiler in the following way:
 cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 ..
 ```
 
+- You need 1 GB of RAM to compile (a bit less might be enough, but 512 MB isn't).
+
 ### To do a generic and static build for a system without gcc 5.1+
 ```
     cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic .

From 145ac6ffb53d5b4ec11b464523c363f8302a8daf Mon Sep 17 00:00:00 2001
From: maurezen <maurezen@users.noreply.github.com>
Date: Tue, 5 Jun 2018 09:16:06 +0300
Subject: [PATCH 03/77] Update FAQ.md

Internal compiler error on low-RAM machines
---
 doc/FAQ.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/FAQ.md b/doc/FAQ.md
index aa6fb8959..a8adf064f 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -11,6 +11,7 @@
 * [How can I mine Monero](#how-can-i-mine-monero)
 * [Why is Monero named monero7](#why-is-monero-named-monero7)
 * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
+* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
 
 ## "Obtaining SeLockMemoryPrivilege failed."
 
@@ -87,3 +88,7 @@ To avoid configuration conflicts after the hard fork of Monero to the new POW wi
 
 If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm.
 Select one of these generic coin algorithms.
+
+## Internal compiler error
+
+Seeing `g++: internal compiler error: Killed (program cc1plus)` most likely means the compiler ran out of RAM. 1 GB of RAM should be enough (it is on a clean Ubuntu 16.04).

From bd81795eeedaafe2fa7752b9919cd8149dd2d408 Mon Sep 17 00:00:00 2001
From: Juan Leni <lenijuan@gmail.com>
Date: Sun, 6 May 2018 21:11:30 +0200
Subject: [PATCH 04/77] adding qrl as a config option

---
 README.md         | 1 +
 xmrstak/jconf.cpp | 1 +
 xmrstak/pools.tpl | 1 +
 3 files changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 887bc5cf3..ff4b8d3ad 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Haven](https://havenprotocol.com)
 - [Intense](https://intensecoin.com)
 - [Masari](https://getmasari.org)
+- [QRL](https://theqrl.org)
 - [Ryo](https://ryo-currency.com)
 - [TurtleCoin](https://turtlecoin.lol)
 
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 354388849..b6580ea9a 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -105,6 +105,7 @@ xmrstak::coin_selection coins[] = {
 	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
 	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
+	{ "qrl",             	 {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "stellite",            {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "turtlecoin",          {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr }
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 6960d63bb..78f2315ac 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -28,6 +28,7 @@ POOLCONF],
  *    intense
  *    masari
  *    monero7 (use this for Monero's new PoW)
+ *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin
  *

From 0e1193d93b09c55b7215ec35dfb4b226cd7bf4bd Mon Sep 17 00:00:00 2001
From: Cheran <cheran.v.senthil@gmail.com>
Date: Wed, 25 Jul 2018 12:04:10 +0530
Subject: [PATCH 05/77] Fix Spelling

changed chose to choose
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 887bc5cf3..ec2a22b20 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Ryo](https://ryo-currency.com)
 - [TurtleCoin](https://turtlecoin.lol)
 
-If your prefered coin is not listed, you can chose one of the following algorithms:
+If your prefered coin is not listed, you can choose one of the following algorithms:
 
 - 1MiB scratchpad memory
     - cryptonight_lite

From cbe03f7e6f2806f15ac726d261ccf38b8304a787 Mon Sep 17 00:00:00 2001
From: Cheran <cheran.v.senthil@gmail.com>
Date: Sun, 29 Jul 2018 17:14:48 +0530
Subject: [PATCH 06/77] Fix Grammar

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 327c4e9f8..e3b01328a 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 ###### fireice-uk's and psychocrypt's
 # XMR-Stak - Cryptonight All-in-One Mining Software
 
-XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currencys Monero, Aeon and many more Cryptonight coins.
+XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins.
 
 ## HTML reports
 <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260">
@@ -28,7 +28,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
 - supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
 - easy to use
   - guided start (no need to edit a config file for the first start)
-  - auto configuration for each backend
+  - auto-configuration for each backend
 - open source software (GPLv3)
 - TLS support
 - [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
@@ -64,7 +64,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_haven
     - cryptonight_heavy
 
-Please note, this list is not complete, and is not an endorsement.
+Please note, this list is not complete and is not an endorsement.
 
 ## Download
 
@@ -72,7 +72,7 @@ You can find the latest releases and precompiled binaries on GitHub under [Relea
 
 ## Default Developer Donation
 
-By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
+By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
 
 If you want to donate directly to support further development, here is my wallet
 

From 63384f4cf6915bfe98047686b622e93c8d0e6b58 Mon Sep 17 00:00:00 2001
From: JokerGermany <30293477+JokerGermany@users.noreply.github.com>
Date: Fri, 3 Aug 2018 20:06:41 +0200
Subject: [PATCH 07/77] Download Link for AMD APP SDK 3.0 fixed

---
 doc/compile_Windows.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 802d5c5ab..add5fbfd0 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -32,8 +32,8 @@
 
 ### AMD APP SDK 3.0 (only needed for AMD GPUs)
 
-- Download and install the latest version from https://www.dropbox.com/s/gq8vqhelq0m6gj4/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
-  (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
+- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
+  (do not wonder why this links to netdna-cdn.com: AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
 
 ### Dependencies OpenSSL/Hwloc and Microhttpd
 - For CUDA 8*:

From 43fa697cfa8b63d1fbc63c363b37252141fd84b6 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Wed, 8 Aug 2018 06:45:52 -0600
Subject: [PATCH 08/77] Add detail to CUDA detections, better for issue reports

---
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 31 ++++++++++++-------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 3b049ace8..b455f55ca 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -450,19 +450,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	if(version < CUDART_VERSION)
 	{
-		printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
+		printf("WARNING: Driver supports CUDA %d.%d but this was compiled for CUDA %d.%d API! Update your nVidia driver or compile with older CUDA!\n",
+			version / 1000, (version % 1000 / 10),
+			CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
 		return 1;
 	}
 
 	int GPU_N;
 	if(cuda_get_devicecount(&GPU_N) == 0)
 	{
+		printf("WARNING: CUDA claims zero devices?\n");
 		return 1;
 	}
 
 	if(ctx->device_id >= GPU_N)
 	{
-		printf("Invalid device ID!\n");
+		printf("WARNING: Invalid device ID '%i'!\n", ctx->device_id);
 		return 1;
 	}
 
@@ -483,6 +486,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	ctx->name = std::string(props.name);
 
+	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ",
+		version / 1000, (version % 1000 / 10),
+		CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10,
+		ctx->device_id, gpuArch, ctx->device_name);
+
 	std::vector<int> arch;
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
@@ -496,13 +504,14 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	while ( ss >> tmpArch )
 		arch.push_back( tmpArch );
 
+	#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
 	if(gpuArch >= 20 && gpuArch < 30)
 	{
 		// compiled binary must support sm_20 for fermi
 		std::vector<int>::iterator it = std::find(arch.begin(), arch.end(), 20);
 		if(it == arch.end())
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
@@ -520,7 +529,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 				minSupportedArch = arch[i];
 		if(minSupportedArch < 30 || gpuArch < minSupportedArch)
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
@@ -529,8 +538,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if(ctx->device_blocks == -1)
 	{
 		/* good values based of my experience
-		 *	 - 3 * SMX count >=sm_30
-		 *   - 2 * SMX count for <sm_30
+		 *   - 3 * SMX count for >=sm_30
+		 *   - 2 * SMX count for  <sm_30
 		 */
 		ctx->device_blocks = props.multiProcessorCount *
 			( props.major < 3 ? 2 : 3 );
@@ -582,18 +591,19 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 		int* tmp;
 		cudaError_t err;
+		#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
 		// a device must be selected to get the right memory usage later on
 		err = cudaSetDevice(ctx->device_id);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: cannot be selected.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaSetDevice");
 			return 2;
 		}
 		// trigger that a context on the gpu will be allocated
 		err = cudaMalloc(&tmp, 256);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: context cannot be created.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaMalloc");
 			return 3;
 		}
 
@@ -626,9 +636,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t usedMem = totalMemory - freeMemory;
 		if(usedMem >= maxMemUsage)
 		{
-			printf("WARNING: NVIDIA GPU %d: already %s MiB memory in use, skip GPU.\n",
-				ctx->device_id,
-				std::to_string(usedMem/byteToMiB).c_str());
+			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str());
 			return 4;
 		}
 		else
@@ -661,6 +669,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 
 	}
+	printf("device init succeeded\n");
 
 	return 0;
 }

From 1eb199a53c3ef796da56ea259251c0b91908764d Mon Sep 17 00:00:00 2001
From: jefferson-1 <jefferson@jeffersonnunnconsulting.com>
Date: Sat, 18 Aug 2018 21:55:06 -0500
Subject: [PATCH 09/77] Incorrect Grammar fix

Improve the grammar of the Dev donation setting.
---
 xmrstak/donate-level.hpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp
index 71b79628a..0f851f8f8 100644
--- a/xmrstak/donate-level.hpp
+++ b/xmrstak/donate-level.hpp
@@ -1,18 +1,19 @@
 #pragma once
 
 /*
- * Dev donation.
- * Percentage of your hashing power that you want to donate to the developer, can be 0.0 if you don't want to do that.
- * Example of how it works for the default setting of 2.0:
- * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minute.
- * Switching is instant, and only happens after a successful connection, so you never loose any hashes.
+ * DEV DONATION SETTING
+ * This setting is a percentage of your hashing power that the miner donates to the developers of this app.
+ * It can be 0.0 if you don't want to help the developers. The default setting of 2.0 means that
+ * the miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minutes.
+ * Switching pools is instant and it only happens after a successful connection, so you don't lose any hash time.
  *
- * If you plan on changing this setting to 0.0 please consider making a one off donation to our wallets:
+ * If you plan on changing this setting to 0.0, please consider making a one time donation to our wallets:
  * fireice-uk:
  * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk
  * psychocrypt:
  * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU
  *
+ * Thank you for your support.
  */
 
 constexpr double fDevDonationLevel = 2.0 / 100.0;

From 69628078c50727e374f425d846a7e0b7997d0405 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Tue, 28 Aug 2018 20:29:57 +0200
Subject: [PATCH 10/77] add self test hashes

Most algorithms are currently not checked in the CPU self-test function.

- add hash for each algorithm
---
 xmrstak/backend/cpu/minethd.cpp | 226 +++++++++++++++++++++-----------
 1 file changed, 149 insertions(+), 77 deletions(-)

diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2e7169ef7..a8452ebb1 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -232,92 +232,164 @@ bool minethd::self_test()
 
 	bool bResult = true;
 
-	if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight)
-	{
-		unsigned char out[32 * MAX_N];
-		cn_hash_fun hashf;
-		cn_hash_fun_multi hashf_multi;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
-
-		hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
-
-		hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_aeon)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_ipbc)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_stellite)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_masari)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_bittube2)
+	unsigned char out[32 * MAX_N];
+	cn_hash_fun hashf;
+	cn_hash_fun_multi hashf_multi;
+
+	xmrstak_algo algo = xmrstak_algo::invalid_algo;
+
+	for(int algo_idx = 0; algo_idx < 2; ++algo_idx)
 	{
-		unsigned char out[32 * MAX_N];
-		cn_hash_fun hashf;
+		if(algo_idx == 0)
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo();
+		else
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
 
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+		if(algo == cryptonight)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+			hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+			hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+		}
+		else if(algo == cryptonight_lite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 
-		hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]);
-		bResult = memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+		}
+		else if(algo == cryptonight_monero)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 
-		hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]);
-		bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+		}
+		else if(algo == cryptonight_aeon)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 
-		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]);
-		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+		}
+		else if(algo == cryptonight_ipbc)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+		}
+		else if(algo == cryptonight_stellite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+		}
+		else if(algo == cryptonight_masari)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+		}
+		else if(algo == cryptonight_heavy)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+		}
+		else if(algo == cryptonight_haven)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+		}
+		else if(algo == cryptonight_bittube2)
+		{
+			unsigned char out[32 * MAX_N];
+			cn_hash_fun hashf;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+
+			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]);
+			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+
+			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]);
+			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
+
+			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]);
+			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+		}
+
+		if(!bResult)
+			printer::inst()->print_msg(L0,
+				"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
 	}
+
 	for (int i = 0; i < MAX_N; i++)
 		cryptonight_free_ctx(ctx[i]);
 
-	if(!bResult)
-		printer::inst()->print_msg(L0,
-			"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
-
 	return bResult;
 }
 

From e7c8382708779a0447ec3e0541512b515b5bea33 Mon Sep 17 00:00:00 2001
From: Piotr Chromiec <tworec@golem.network>
Date: Thu, 13 Sep 2018 13:52:49 +0200
Subject: [PATCH 11/77] AMD APP SDK 3.0 url fix

dropbox link is broken
---
 doc/compile_Linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index 072402ff7..fce52b64c 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -4,7 +4,7 @@
 
 ### AMD APP SDK 3.0 (only needed to use AMD GPUs)
 
-- download and install the latest version from https://www.dropbox.com/sh/mpg882ekirnsfa7/AADWz5X-TgVdsmWt0QwMgTWLa/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2?dl=0
+- download and install the latest version from http://debian.nullivex.com/amd/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 (see https://github.com/fireice-uk/xmr-stak/issues/1511#issuecomment-385120692)
   (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/228059)
 
 ### Cuda 8.0+ (only needed to use NVIDIA GPUs)

From 931bd5fef17f908afc62836ae7b6ea087d1441ca Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 16:49:59 +0200
Subject: [PATCH 12/77] unify cpu cryptonight implementations

xmr-stak has several implementations for multi hash per thread.
This results in 3 independent implementations.
Each time the algorithm must be changed, the possibility of introducing errors is very large.

- unify the different cryptonight CPU implementations
- simplify the function selection array to find the specialized cryptonight implementation
- add an intermediate pointer to access the large state (similar to the old multi hash implementation)

As a side effect, this change increases the speed of the single and multi hash algorithms.
---
 xmrstak/backend/amd/minethd.cpp               |   2 +-
 xmrstak/backend/amd/minethd.hpp               |   2 +-
 .../backend/cpu/crypto/cryptonight_aesni.h    | 947 ++++++------------
 xmrstak/backend/cpu/minethd.cpp               | 478 ++-------
 xmrstak/backend/cpu/minethd.hpp               |   7 +-
 xmrstak/backend/nvidia/minethd.cpp            |   2 +-
 xmrstak/backend/nvidia/minethd.hpp            |   2 +-
 7 files changed, 373 insertions(+), 1067 deletions(-)

diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index f7b47249e..d6051ffcd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -252,7 +252,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 3142117c5..04c2ff8ad 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -24,7 +24,7 @@ class minethd  : public iBackend
 	static bool init_gpus();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
 
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9f70bcfa7..89c508990 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -151,15 +151,15 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
 
 inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
 {
-    __m128i tmp0 = x0;
-    x0 = _mm_xor_si128(x0, x1);
-    x1 = _mm_xor_si128(x1, x2);
-    x2 = _mm_xor_si128(x2, x3);
-    x3 = _mm_xor_si128(x3, x4);
-    x4 = _mm_xor_si128(x4, x5);
-    x5 = _mm_xor_si128(x5, x6);
-    x6 = _mm_xor_si128(x6, x7);
-    x7 = _mm_xor_si128(x7, tmp0);
+	__m128i tmp0 = x0;
+	x0 = _mm_xor_si128(x0, x1);
+	x1 = _mm_xor_si128(x1, x2);
+	x2 = _mm_xor_si128(x2, x3);
+	x3 = _mm_xor_si128(x3, x4);
+	x4 = _mm_xor_si128(x4, x5);
+	x5 = _mm_xor_si128(x5, x6);
+	x6 = _mm_xor_si128(x6, x7);
+	x7 = _mm_xor_si128(x7, tmp0);
 }
 
 template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
@@ -467,712 +467,325 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 }
 
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
-{
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
-
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
-	{
-		memset(output, 0, 32);
-		return;
-	}
-
-	keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
-
-	uint64_t monero_const;
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-	{
-		monero_const  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
-		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx0->hash_state) + 24);
+#define CN_INIT_SINGLE \
+	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
+	{ \
+		memset(output, 0, 32 * N); \
+		return; \
 	}
 
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
-
-	uint8_t* l0 = ctx0->long_state;
-	uint64_t* h0 = (uint64_t*)ctx0->hash_state;
-
-	uint64_t al0 = h0[0] ^ h0[4];
-	uint64_t ah0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-	uint64_t idx0 = h0[0] ^ h0[4];
-
-	// Optim - 90% time boundary
-	for(size_t i = 0; i < ITERATIONS; i++)
-	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0));
-		} 
-		else
-		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-		}
-
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		else
-			_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-
-		idx0 = _mm_cvtsi128_si64(cx);
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-		bx0 = cx;
-
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
-
-		lo = _umul128(idx0, cl, &hi);
-
-		al0 += hi;
-		((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-		al0 ^= cl;
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0);
-		ah0 += lo;
-
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0];
-			else
-				((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const;
-		}
-		else
-			((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
-		ah0 ^= ch;
-
-		idx0 = al0;
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
+	uint64_t monero_const; \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		monero_const =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35); \
+		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
+	} \
+	/* Optim - 99% time boundary */ \
+	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \
+	\
+	__m128i ax0; \
+	uint64_t idx0; \
+	__m128i bx0; \
+	uint8_t* l0 = ctx[n]->long_state; \
+	{ \
+		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
+		idx0 = h0[0] ^ h0[4]; \
+		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
+		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+	} \
+	__m128i *ptr0
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
 
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = d ^ q;
-		}
-		else if(ALGO == cryptonight_haven)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
-
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = (~d) ^ q;
-		}
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+	__m128i cx; \
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	cx = _mm_load_si128(ptr0); \
+	if (ALGO == cryptonight_bittube2) \
+	{ \
+		cx = aes_round_bittube2(cx, ax0); \
+	} \
+	else \
+	{ \
+		if(SOFT_AES) \
+			cx = soft_aesenc(cx, ax0); \
+		else \
+			cx = _mm_aesenc_si128(cx, ax0); \
 	}
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
-
-	// Optim - 99% time boundary
-
-	keccakf((uint64_t*)ctx0->hash_state, 24);
-	extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
-}
-
-// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
-// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
-// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
-{
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
-
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
-	{
-		memset(output, 0, 64);
-		return;
-	}
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
+	else \
+		_mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
+	idx0 = _mm_cvtsi128_si64(cx); \
+	\
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
+	bx0 = cx; \
+
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+	uint64_t lo, cl, ch; \
+	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
+	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
+	cl = ((uint64_t*)ptr0)[0]; \
+	ch = ((uint64_t*)ptr0)[1]; \
+	\
+	{ \
+		uint64_t hi; \
+		lo = _umul128(idx0, cl, &hi); \
+		ah0 += lo; \
+		al0 += hi; \
+	} \
+	((uint64_t*)ptr0)[0] = al0; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
+	
 
-	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-	keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
+		else \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
+	} \
+	else \
+		((uint64_t*)ptr0)[1] = ah0; \
+	al0 ^= cl; \
+	ah0 ^= ch; \
+	ax0 = _mm_set_epi64x(ah0, al0); \
+	idx0 = al0;
 
-	uint64_t monero_const_0, monero_const_1;
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-	{
-		monero_const_0  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
-		monero_const_0 ^=  *(reinterpret_cast<const uint64_t*>(ctx[0]->hash_state) + 24);
-		monero_const_1  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len + 35);
-		monero_const_1 ^=  *(reinterpret_cast<const uint64_t*>(ctx[1]->hash_state) + 24);
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; \
+		int32_t d  = ((int32_t*)ptr0)[2]; \
+		int64_t q = u / (d | 0x5); \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; \
+		idx0 = d ^ q; \
+	} \
+	else if(ALGO == cryptonight_haven) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; \
+		int32_t d  = ((int32_t*)ptr0)[2]; \
+		int64_t q = u / (d | 0x5); \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; \
+		idx0 = (~d) ^ q; \
 	}
 
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+#define CN_FINALIZE(n) \
+	/* Optim - 90% time boundary */ \
+	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \
+	/* Optim - 99% time boundary */ \
+	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
 
-	uint64_t axl0 = h0[0] ^ h0[4];
-	uint64_t axh0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	uint64_t axl1 = h1[0] ^ h1[4];
-	uint64_t axh1 = h1[1] ^ h1[5];
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-	uint64_t idx0 = h0[0] ^ h0[4];
-	uint64_t idx1 = h1[0] ^ h1[4];
+//! defer the evaluation of a macro
+#ifndef _MSC_VER
+#	define CN_DEFER(...) __VA_ARGS__
+#else
+#	define CN_EMPTY(...)
+#	define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#endif
 
-	// Optim - 90% time boundary
-	for (size_t i = 0; i < ITERATIONS; i++)
+//! execute the macro f with the passed arguments
+#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+
+/** append n to all arguments and keep n as the first argument
+ *
+ * @param n number which is appended to the arguments (except the first argument n)
+ * 
+ * @code{.cpp}
+ * CN_ENUM_2(1, foo, bar)
+ * // is transformed to
+ * 1, foo1, bar1
+ * @endcode
+ */
+#define CN_ENUM_0(n, ...) n
+#define CN_ENUM_1(n, x1) n, x1 ## n
+#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+
+/** repeat a macro call multiple times
+ *
+ * @param n number of arguments following f
+ * @param f name of the macro which should be executed
+ * @param ... n parameters whose names get a unique number appended
+ *
+ * @code{.cpp}
+ * REPEAT_2(2, f, foo, bar)
+ * // is transformed to
+ * f(0, foo0, bar0); f(1, foo1, bar1)
+ * @endcode
+ */
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
+
+template< size_t N>
+struct Cryptonight_hash;
+
+template< >
+struct Cryptonight_hash<1>
+{
+	static constexpr size_t N = 1;
+	
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0));
-		} 
-		else
-		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
-		}
-
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		else
-			_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-
-		idx0 = _mm_cvtsi128_si64(cx);
-		bx0 = cx;
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-
-		cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]);
-
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1));
-		} 
-		else
-		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
-		}
-
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-		else
-			_mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-
-		idx1 = _mm_cvtsi128_si64(cx);
-		bx1 = cx;
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
-
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
-
-		lo = _umul128(idx0, cl, &hi);
-
-		axl0 += hi;
-		axh0 += lo;
-		((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
-
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0];
-			else
-				((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0;
-		} else
-			((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
-
-		axh0 ^= ch;
-		axl0 ^= cl;
-		idx0 = axl0;
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
+		CN_INIT_SINGLE;
+		REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
 
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = d ^ q;
-		}
-		else if(ALGO == cryptonight_haven)
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
 
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = (~d) ^ q;
+			REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-
-		cl = ((uint64_t*)&l1[idx1 & MASK])[0];
-		ch = ((uint64_t*)&l1[idx1 & MASK])[1];
-
-		lo = _umul128(idx1, cl, &hi);
-
-		axl1 += hi;
-		axh1 += lo;
-		((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
+		REPEAT_1(0, CN_FINALIZE);
+	}
+};
 
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0];
-			else
-				((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1;
-		} else
-			((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+template< >
+struct Cryptonight_hash<2>
+{
+	static constexpr size_t N = 2;
 
-		axh1 ^= ch;
-		axl1 ^= cl;
-		idx1 = axl1;
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
-		{
-			int64_t n  = ((int64_t*)&l1[idx1 & MASK])[0];
-			int32_t d  = ((int32_t*)&l1[idx1 & MASK])[2];
-			int64_t q = n / (d | 0x5);
+		CN_INIT_SINGLE;
+		REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
 
-			((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-			idx1 = d ^ q;
-		}
-		else if(ALGO == cryptonight_haven)
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			int64_t n  = ((int64_t*)&l1[idx1 & MASK])[0];
-			int32_t d  = ((int32_t*)&l1[idx1 & MASK])[2];
-			int64_t q = n / (d | 0x5);
-
-			((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-			idx1 = (~d) ^ q;
+			REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
+		REPEAT_2(0, CN_FINALIZE);
 	}
+};
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
-
-	// Optim - 99% time boundary
-
-	keccakf((uint64_t*)ctx[0]->hash_state, 24);
-	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
-	keccakf((uint64_t*)ctx[1]->hash_state, 24);
-	extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
-}
-
-#define CN_STEP1(a, b, c, l, ptr, idx)				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	c = _mm_load_si128(ptr);
-
-#define CN_STEP2(a, b, c, l, ptr, idx)				\
-	if (ALGO == cryptonight_bittube2)	\
-	{	\
-		c = aes_round_bittube2(c, a);	\
-	}	\
-	else	\
-	{	\
-		if(SOFT_AES)					\
-			c = soft_aesenc(c, a);			\
-		else						\
-			c = _mm_aesenc_si128(c, a);		\
-	} 							\
-	b = _mm_xor_si128(b, c);				\
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr, b); \
-	else \
-		_mm_store_si128(ptr, b);\
-
-#define CN_STEP3(a, b, c, l, ptr, idx)				\
-	idx = _mm_cvtsi128_si64(c);				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	b = _mm_load_si128(ptr);
-
-#define CN_STEP4(a, b, c, l, mc, ptr, idx)				\
-	lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi);		\
-	a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi));		\
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		_mm_store_si128(ptr, _mm_xor_si128(a, mc)); \
-		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
-			((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\
-	} \
-	else \
-		_mm_store_si128(ptr, a);\
-	a = _mm_xor_si128(a, b); \
-	idx = _mm_cvtsi128_si64(a);	\
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
-	{ \
-		int64_t n  = ((int64_t*)&l[idx & MASK])[0]; \
-		int32_t d  = ((int32_t*)&l[idx & MASK])[2]; \
-		int64_t q = n / (d | 0x5); \
-		((int64_t*)&l[idx & MASK])[0] = n ^ q; \
-		idx = d ^ q; \
-	} \
-	else if(ALGO == cryptonight_haven) \
-	{ \
-		int64_t n  = ((int64_t*)&l[idx & MASK])[0]; \
-		int32_t d  = ((int32_t*)&l[idx & MASK])[2]; \
-		int64_t q = n / (d | 0x5); \
-		((int64_t*)&l[idx & MASK])[0] = n ^ q; \
-		idx = (~d) ^ q; \
-	}
-
-#define CONST_INIT(ctx, n) \
-	__m128i mc##n = _mm_set_epi64x(*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + n * len + 35) ^ \
-	*(reinterpret_cast<const uint64_t*>((ctx)->hash_state) + 24), 0);
-
-// This lovelier creation will do 3 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template< >
+struct Cryptonight_hash<3>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 3;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 32 * 3);
-		return;
-	}
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 3; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		CN_INIT_SINGLE;
+		REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-	}
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-	for (size_t i = 0; i < 3; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		REPEAT_3(0, CN_FINALIZE);
 	}
-}
+};
 
-// This even lovelier creation will do 4 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template< >
+struct Cryptonight_hash<4>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 4;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 32 * 4);
-		return;
-	}
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 4; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		CN_INIT_SINGLE;
+		REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-	CONST_INIT(ctx[3], 3);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2, idx3;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-	idx3 = _mm_cvtsi128_si64(ax3);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
-	}
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-	for (size_t i = 0; i < 4; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		REPEAT_4(0, CN_FINALIZE);
 	}
-}
+};
 
-// This most lovely creation will do 5 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template< >
+struct Cryptonight_hash<5>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 5;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 32 * 5);
-		return;
-	}
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 5; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		CN_INIT_SINGLE;
+		REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-	CONST_INIT(ctx[3], 3);
-	CONST_INIT(ctx[4], 4);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-	uint8_t* l4 = ctx[4]->long_state;
-	uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
-	__m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-	__m128i cx4 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2, idx3, idx4;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-	idx3 = _mm_cvtsi128_si64(ax3);
-	idx4 = _mm_cvtsi128_si64(ax4);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
-		CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
-		CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4);
-	}
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-	for (size_t i = 0; i < 5; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		REPEAT_5(0, CN_FINALIZE);
 	}
-}
+};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index a8452ebb1..93ce218a3 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -234,7 +234,7 @@ bool minethd::self_test()
 
 	unsigned char out[32 * MAX_N];
 	cn_hash_fun hashf;
-	cn_hash_fun_multi hashf_multi;
+	cn_hash_fun hashf_multi;
 
 	xmrstak_algo algo = xmrstak_algo::invalid_algo;
 
@@ -248,37 +248,37 @@ bool minethd::self_test()
 		if(algo == cryptonight)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf("This is a test", 14, out, ctx[0]);
+			hashf("This is a test", 14, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-			hashf("This is a test", 14, out, ctx[0]);
+			hashf("This is a test", 14, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
-			hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
 			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
 					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
 
-			hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
 			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
 					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
 
-			hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
 			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
 
-			hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
 			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
 
-			hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
 			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
@@ -289,7 +289,7 @@ bool minethd::self_test()
 		else if(algo == cryptonight_lite)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
@@ -298,71 +298,71 @@ bool minethd::self_test()
 		else if(algo == cryptonight_monero)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
 		else if(algo == cryptonight_aeon)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 		}
 		else if(algo == cryptonight_ipbc)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
 		}
 		else if(algo == cryptonight_stellite)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 		}
 		else if(algo == cryptonight_masari)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 		}
 		else if(algo == cryptonight_heavy)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 		}
 		else if(algo == cryptonight_haven)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
-			hashf("This is a test This is a test This is a test", 44, out, ctx[0]);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 		}
 		else if(algo == cryptonight_bittube2)
@@ -372,13 +372,13 @@ bool minethd::self_test()
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
 
-			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]);
+			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
 
-			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]);
+			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx);
 			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
 
-			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]);
+			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 		}
 
@@ -438,8 +438,10 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 	return pvThreads;
 }
 
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+template<size_t N>
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
 {
+	static_assert(N >= 1, "number of threads must be >= 1" );
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -483,46 +485,55 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr
 	}
 
 	static const cn_hash_fun func_table[] = {
-		cryptonight_hash<cryptonight_monero, false, false>,
-		cryptonight_hash<cryptonight_monero, true, false>,
-		cryptonight_hash<cryptonight_monero, false, true>,
-		cryptonight_hash<cryptonight_monero, true, true>,
-		cryptonight_hash<cryptonight_lite, false, false>,
-		cryptonight_hash<cryptonight_lite, true, false>,
-		cryptonight_hash<cryptonight_lite, false, true>,
-		cryptonight_hash<cryptonight_lite, true, true>,
-		cryptonight_hash<cryptonight, false, false>,
-		cryptonight_hash<cryptonight, true, false>,
-		cryptonight_hash<cryptonight, false, true>,
-		cryptonight_hash<cryptonight, true, true>,
-		cryptonight_hash<cryptonight_heavy, false, false>,
-		cryptonight_hash<cryptonight_heavy, true, false>,
-		cryptonight_hash<cryptonight_heavy, false, true>,
-		cryptonight_hash<cryptonight_heavy, true, true>,
-		cryptonight_hash<cryptonight_aeon, false, false>,
-		cryptonight_hash<cryptonight_aeon, true, false>,
-		cryptonight_hash<cryptonight_aeon, false, true>,
-		cryptonight_hash<cryptonight_aeon, true, true>,
-		cryptonight_hash<cryptonight_ipbc, false, false>,
-		cryptonight_hash<cryptonight_ipbc, true, false>,
-		cryptonight_hash<cryptonight_ipbc, false, true>,
-		cryptonight_hash<cryptonight_ipbc, true, true>,
-		cryptonight_hash<cryptonight_stellite, false, false>,
-		cryptonight_hash<cryptonight_stellite, true, false>,
-		cryptonight_hash<cryptonight_stellite, false, true>,
-		cryptonight_hash<cryptonight_stellite, true, true>,
-		cryptonight_hash<cryptonight_masari, false, false>,
-		cryptonight_hash<cryptonight_masari, true, false>,
-		cryptonight_hash<cryptonight_masari, false, true>,
-		cryptonight_hash<cryptonight_masari, true, true>,
-		cryptonight_hash<cryptonight_haven, false, false>,
-		cryptonight_hash<cryptonight_haven, true, false>,
-		cryptonight_hash<cryptonight_haven, false, true>,
-		cryptonight_hash<cryptonight_haven, true, true>,
-		cryptonight_hash<cryptonight_bittube2, false, false>,
-		cryptonight_hash<cryptonight_bittube2, true, false>,
-		cryptonight_hash<cryptonight_bittube2, false, true>,
-		cryptonight_hash<cryptonight_bittube2, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>
 	};
 
 	std::bitset<2> digit;
@@ -532,333 +543,14 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr
 	return func_table[ algv << 2 | digit.to_ulong() ];
 }
 
-void minethd::work_main()
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
 {
-	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
-
-	order_fix.set_value();
-	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
-	std::this_thread::yield();
-
-	cryptonight_ctx* ctx;
-	uint64_t iCount = 0;
-	uint64_t* piHashVal;
-	uint32_t* piNonce;
-	job_result result;
-
-	// start with root algorithm and switch later if fork version is reached
-	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-	ctx = minethd_alloc_ctx();
-
-	piHashVal = (uint64_t*)(result.bResult + 24);
-	piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
-	result.iThreadId = iThreadNo;
-
-	uint8_t version = 0;
-	size_t lastPoolId = 0;
-
-	while (bQuit == 0)
-	{
-		if (oWork.bStall)
-		{
-			/* We are stalled here because the executor didn't find a job for us yet,
-			 * either because of network latency, or a socket problem. Since we are
-			 * raison d'etre of this software it us sensible to just wait until we have something
-			 */
-
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
-				std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-			globalStates::inst().consume_work(oWork, iJobNo);
-			continue;
-		}
-
-		size_t nonce_ctr = 0;
-		constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2
-
-		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
-		memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID));
-
-		if(oWork.bNiceHash)
-			result.iNonce = *piNonce;
-
-		uint8_t new_version = oWork.getVersion();
-		if(new_version != version || oWork.iPoolId != lastPoolId)
-		{
-			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
-			if(new_version >= coinDesc.GetMiningForkVersion())
-			{
-				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-			}
-			else
-			{
-				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-			}
-			result.algorithm = miner_algo;
-			lastPoolId = oWork.iPoolId;
-			version = new_version;
-		}
-
-		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
-		{
-			if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
-			{
-				uint64_t iStamp = get_timestamp_ms();
-				iHashCount.store(iCount, std::memory_order_relaxed);
-				iTimestamp.store(iStamp, std::memory_order_relaxed);
-			}
-
-			if((nonce_ctr++ & (nonce_chunk-1)) == 0)
-			{
-				globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
-				// check if the job is still valid, there is a small posibility that the job is switched
-				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
-					break;
-			}
-
-			*piNonce = result.iNonce;
-
-			hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
-
-			if (*piHashVal < oWork.iTarget)
-				executor::inst()->push_event(ex_event(result, oWork.iPoolId));
-			result.iNonce++;
-
-			std::this_thread::yield();
-		}
-
-		globalStates::inst().consume_work(oWork, iJobNo);
-	}
-
-	cryptonight_free_ctx(ctx);
+	return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo);
 }
 
-minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+void minethd::work_main()
 {
-	// We have two independent flag bits in the functions
-	// therefore we will build a binary digit and select the
-	// function as a two digit binary
-
-	uint8_t algv;
-	switch(algo)
-	{
-	case cryptonight:
-		algv = 2;
-		break;
-	case cryptonight_lite:
-		algv = 1;
-		break;
-	case cryptonight_monero:
-		algv = 0;
-		break;
-	case cryptonight_heavy:
-		algv = 3;
-		break;
-	case cryptonight_aeon:
-		algv = 4;
-		break;
-	case cryptonight_ipbc:
-		algv = 5;
-		break;
-	case cryptonight_stellite:
-		algv = 6;
-		break;
-	case cryptonight_masari:
-		algv = 7;
-		break;
-	case cryptonight_haven:
-		algv = 8;
-		break;
-	case cryptonight_bittube2:
-		algv = 9;
-		break;
-	default:
-		algv = 2;
-		break;
-	}
-
-	static const cn_hash_fun_multi func_table[] = {
-		cryptonight_double_hash<cryptonight_monero, false, false>,
-		cryptonight_double_hash<cryptonight_monero, true, false>,
-		cryptonight_double_hash<cryptonight_monero, false, true>,
-		cryptonight_double_hash<cryptonight_monero, true, true>,
-		cryptonight_triple_hash<cryptonight_monero, false, false>,
-		cryptonight_triple_hash<cryptonight_monero, true, false>,
-		cryptonight_triple_hash<cryptonight_monero, false, true>,
-		cryptonight_triple_hash<cryptonight_monero, true, true>,
-		cryptonight_quad_hash<cryptonight_monero, false, false>,
-		cryptonight_quad_hash<cryptonight_monero, true, false>,
-		cryptonight_quad_hash<cryptonight_monero, false, true>,
-		cryptonight_quad_hash<cryptonight_monero, true, true>,
-		cryptonight_penta_hash<cryptonight_monero, false, false>,
-		cryptonight_penta_hash<cryptonight_monero, true, false>,
-		cryptonight_penta_hash<cryptonight_monero, false, true>,
-		cryptonight_penta_hash<cryptonight_monero, true, true>,
-
-		cryptonight_double_hash<cryptonight_lite, false, false>,
-		cryptonight_double_hash<cryptonight_lite, true, false>,
-		cryptonight_double_hash<cryptonight_lite, false, true>,
-		cryptonight_double_hash<cryptonight_lite, true, true>,
-		cryptonight_triple_hash<cryptonight_lite, false, false>,
-		cryptonight_triple_hash<cryptonight_lite, true, false>,
-		cryptonight_triple_hash<cryptonight_lite, false, true>,
-		cryptonight_triple_hash<cryptonight_lite, true, true>,
-		cryptonight_quad_hash<cryptonight_lite, false, false>,
-		cryptonight_quad_hash<cryptonight_lite, true, false>,
-		cryptonight_quad_hash<cryptonight_lite, false, true>,
-		cryptonight_quad_hash<cryptonight_lite, true, true>,
-		cryptonight_penta_hash<cryptonight_lite, false, false>,
-		cryptonight_penta_hash<cryptonight_lite, true, false>,
-		cryptonight_penta_hash<cryptonight_lite, false, true>,
-		cryptonight_penta_hash<cryptonight_lite, true, true>,
-
-		cryptonight_double_hash<cryptonight, false, false>,
-		cryptonight_double_hash<cryptonight, true, false>,
-		cryptonight_double_hash<cryptonight, false, true>,
-		cryptonight_double_hash<cryptonight, true, true>,
-		cryptonight_triple_hash<cryptonight, false, false>,
-		cryptonight_triple_hash<cryptonight, true, false>,
-		cryptonight_triple_hash<cryptonight, false, true>,
-		cryptonight_triple_hash<cryptonight, true, true>,
-		cryptonight_quad_hash<cryptonight, false, false>,
-		cryptonight_quad_hash<cryptonight, true, false>,
-		cryptonight_quad_hash<cryptonight, false, true>,
-		cryptonight_quad_hash<cryptonight, true, true>,
-		cryptonight_penta_hash<cryptonight, false, false>,
-		cryptonight_penta_hash<cryptonight, true, false>,
-		cryptonight_penta_hash<cryptonight, false, true>,
-		cryptonight_penta_hash<cryptonight, true, true>,
-
-		cryptonight_double_hash<cryptonight_heavy, false, false>,
-		cryptonight_double_hash<cryptonight_heavy, true, false>,
-		cryptonight_double_hash<cryptonight_heavy, false, true>,
-		cryptonight_double_hash<cryptonight_heavy, true, true>,
-		cryptonight_triple_hash<cryptonight_heavy, false, false>,
-		cryptonight_triple_hash<cryptonight_heavy, true, false>,
-		cryptonight_triple_hash<cryptonight_heavy, false, true>,
-		cryptonight_triple_hash<cryptonight_heavy, true, true>,
-		cryptonight_quad_hash<cryptonight_heavy, false, false>,
-		cryptonight_quad_hash<cryptonight_heavy, true, false>,
-		cryptonight_quad_hash<cryptonight_heavy, false, true>,
-		cryptonight_quad_hash<cryptonight_heavy, true, true>,
-		cryptonight_penta_hash<cryptonight_heavy, false, false>,
-		cryptonight_penta_hash<cryptonight_heavy, true, false>,
-		cryptonight_penta_hash<cryptonight_heavy, false, true>,
-		cryptonight_penta_hash<cryptonight_heavy, true, true>,
-
-		cryptonight_double_hash<cryptonight_aeon, false, false>,
-		cryptonight_double_hash<cryptonight_aeon, true, false>,
-		cryptonight_double_hash<cryptonight_aeon, false, true>,
-		cryptonight_double_hash<cryptonight_aeon, true, true>,
-		cryptonight_triple_hash<cryptonight_aeon, false, false>,
-		cryptonight_triple_hash<cryptonight_aeon, true, false>,
-		cryptonight_triple_hash<cryptonight_aeon, false, true>,
-		cryptonight_triple_hash<cryptonight_aeon, true, true>,
-		cryptonight_quad_hash<cryptonight_aeon, false, false>,
-		cryptonight_quad_hash<cryptonight_aeon, true, false>,
-		cryptonight_quad_hash<cryptonight_aeon, false, true>,
-		cryptonight_quad_hash<cryptonight_aeon, true, true>,
-		cryptonight_penta_hash<cryptonight_aeon, false, false>,
-		cryptonight_penta_hash<cryptonight_aeon, true, false>,
-		cryptonight_penta_hash<cryptonight_aeon, false, true>,
-		cryptonight_penta_hash<cryptonight_aeon, true, true>,
-
-		cryptonight_double_hash<cryptonight_ipbc, false, false>,
-		cryptonight_double_hash<cryptonight_ipbc, true, false>,
-		cryptonight_double_hash<cryptonight_ipbc, false, true>,
-		cryptonight_double_hash<cryptonight_ipbc, true, true>,
-		cryptonight_triple_hash<cryptonight_ipbc, false, false>,
-		cryptonight_triple_hash<cryptonight_ipbc, true, false>,
-		cryptonight_triple_hash<cryptonight_ipbc, false, true>,
-		cryptonight_triple_hash<cryptonight_ipbc, true, true>,
-		cryptonight_quad_hash<cryptonight_ipbc, false, false>,
-		cryptonight_quad_hash<cryptonight_ipbc, true, false>,
-		cryptonight_quad_hash<cryptonight_ipbc, false, true>,
-		cryptonight_quad_hash<cryptonight_ipbc, true, true>,
-		cryptonight_penta_hash<cryptonight_ipbc, false, false>,
-		cryptonight_penta_hash<cryptonight_ipbc, true, false>,
-		cryptonight_penta_hash<cryptonight_ipbc, false, true>,
-		cryptonight_penta_hash<cryptonight_ipbc, true, true>,
-
-		cryptonight_double_hash<cryptonight_stellite, false, false>,
-		cryptonight_double_hash<cryptonight_stellite, true, false>,
-		cryptonight_double_hash<cryptonight_stellite, false, true>,
-		cryptonight_double_hash<cryptonight_stellite, true, true>,
-		cryptonight_triple_hash<cryptonight_stellite, false, false>,
-		cryptonight_triple_hash<cryptonight_stellite, true, false>,
-		cryptonight_triple_hash<cryptonight_stellite, false, true>,
-		cryptonight_triple_hash<cryptonight_stellite, true, true>,
-		cryptonight_quad_hash<cryptonight_stellite, false, false>,
-		cryptonight_quad_hash<cryptonight_stellite, true, false>,
-		cryptonight_quad_hash<cryptonight_stellite, false, true>,
-		cryptonight_quad_hash<cryptonight_stellite, true, true>,
-		cryptonight_penta_hash<cryptonight_stellite, false, false>,
-		cryptonight_penta_hash<cryptonight_stellite, true, false>,
-		cryptonight_penta_hash<cryptonight_stellite, false, true>,
-		cryptonight_penta_hash<cryptonight_stellite, true, true>,
-
-		cryptonight_double_hash<cryptonight_masari, false, false>,
-		cryptonight_double_hash<cryptonight_masari, true, false>,
-		cryptonight_double_hash<cryptonight_masari, false, true>,
-		cryptonight_double_hash<cryptonight_masari, true, true>,
-		cryptonight_triple_hash<cryptonight_masari, false, false>,
-		cryptonight_triple_hash<cryptonight_masari, true, false>,
-		cryptonight_triple_hash<cryptonight_masari, false, true>,
-		cryptonight_triple_hash<cryptonight_masari, true, true>,
-		cryptonight_quad_hash<cryptonight_masari, false, false>,
-		cryptonight_quad_hash<cryptonight_masari, true, false>,
-		cryptonight_quad_hash<cryptonight_masari, false, true>,
-		cryptonight_quad_hash<cryptonight_masari, true, true>,
-		cryptonight_penta_hash<cryptonight_masari, false, false>,
-		cryptonight_penta_hash<cryptonight_masari, true, false>,
-		cryptonight_penta_hash<cryptonight_masari, false, true>,
-		cryptonight_penta_hash<cryptonight_masari, true, true>,
-		
-		cryptonight_double_hash<cryptonight_haven, false, false>,
-		cryptonight_double_hash<cryptonight_haven, true, false>,
-		cryptonight_double_hash<cryptonight_haven, false, true>,
-		cryptonight_double_hash<cryptonight_haven, true, true>,
-		cryptonight_triple_hash<cryptonight_haven, false, false>,
-		cryptonight_triple_hash<cryptonight_haven, true, false>,
-		cryptonight_triple_hash<cryptonight_haven, false, true>,
-		cryptonight_triple_hash<cryptonight_haven, true, true>,
-		cryptonight_quad_hash<cryptonight_haven, false, false>,
-		cryptonight_quad_hash<cryptonight_haven, true, false>,
-		cryptonight_quad_hash<cryptonight_haven, false, true>,
-		cryptonight_quad_hash<cryptonight_haven, true, true>,
-		cryptonight_penta_hash<cryptonight_haven, false, false>,
-		cryptonight_penta_hash<cryptonight_haven, true, false>,
-		cryptonight_penta_hash<cryptonight_haven, false, true>,
-		cryptonight_penta_hash<cryptonight_haven, true, true>,
-
-		cryptonight_double_hash<cryptonight_bittube2, false, false>,
-		cryptonight_double_hash<cryptonight_bittube2, true, false>,
-		cryptonight_double_hash<cryptonight_bittube2, false, true>,
-		cryptonight_double_hash<cryptonight_bittube2, true, true>,
-		cryptonight_triple_hash<cryptonight_bittube2, false, false>,
-		cryptonight_triple_hash<cryptonight_bittube2, true, false>,
-		cryptonight_triple_hash<cryptonight_bittube2, false, true>,
-		cryptonight_triple_hash<cryptonight_bittube2, true, true>,
-		cryptonight_quad_hash<cryptonight_bittube2, false, false>,
-		cryptonight_quad_hash<cryptonight_bittube2, true, false>,
-		cryptonight_quad_hash<cryptonight_bittube2, false, true>,
-		cryptonight_quad_hash<cryptonight_bittube2, true, true>,
-		cryptonight_penta_hash<cryptonight_bittube2, false, false>,
-		cryptonight_penta_hash<cryptonight_bittube2, true, false>,
-		cryptonight_penta_hash<cryptonight_bittube2, false, true>,
-		cryptonight_penta_hash<cryptonight_bittube2, true, true>
-	};
-
-	std::bitset<2> digit;
-	digit.set(0, !bHaveAes);
-	digit.set(1, !bNoPrefetch);
-
-	return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()];
+	multiway_work_main<1u>();
 }
 
 void minethd::double_work_main()
@@ -926,7 +618,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun_multi hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -961,12 +653,12 @@ void minethd::multiway_work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 2d40ce314..26478542c 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -22,7 +22,7 @@ class minethd : public iBackend
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
 	static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id);
@@ -30,8 +30,9 @@ class minethd : public iBackend
 	static cryptonight_ctx* minethd_alloc_ctx();
 
 private:
-	typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**);
-	static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+
+	template<size_t N>
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
 
 	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
 
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 88a1acc32..486a990e3 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -300,7 +300,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index d4ae03864..389356842 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -28,7 +28,7 @@ class minethd : public iBackend
 	static bool self_test();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
 	void start_mining();

From c5ac310a7c65fdb0824c8c813f553bd90fddbb2b Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Sat, 15 Sep 2018 12:13:24 -0600
Subject: [PATCH 13/77] Update `doc/FAQ.md` with unified proper methods for
 Linux limits

---
 doc/FAQ.md | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/doc/FAQ.md b/doc/FAQ.md
index 50897659f..2d2820166 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -45,20 +45,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin
 
 ## Error: MEMORY ALLOC FAILED: mmap failed
 
-On Linux you will need to configure large page support and increase your ulimit -l.
+On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
 
-To set large page support, add the following lines to `/etc/sysctl.conf` (`/etc/sysctl.d/xmr-stak.conf` for [Arch Linux](https://www.archlinux.org/news/deprecation-of-etcsysctlconf/) and its derivatives):
+Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf`: those files hold system defaults and can be replaced during upgrades. Custom settings in those files have been deprecated in all distros since at least wheezy/trusty (and have been disallowed in RedHat-based distros for longer than that), and are even more discouraged with systemd (for example, it no longer reads `sysctl.conf` at all, ONLY `sysctl.d` files; there is a link to the old `/etc/sysctl.conf` for backward compatibility, but that can go away at any time).  Adding commands to `/etc/rc.local` is also incorrect: systemd does not use that file anymore (once the sysvinit compatibility layer is gone, `rc.local` will no longer work).
+
+To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults):
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 0
+    16
+
+To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
 
     vm.nr_hugepages=128
 
-To increase the ulimit, add following lines to `/etc/security/limits.conf`:
+You WILL need to run `sudo sysctl --system` (or reboot) for these settings to take effect on your system.  In some cases (many threads, very large CPU, etc.) you may need more than 128 (try 256 if threads still complain during initialization).
 
-    * soft memlock 262144
-    * hard memlock 262144
+To increase the memlock limit (`ulimit -l`), add the following lines to `/etc/security/limits.d/60-memlock.conf`:
+
+    *    - memlock 262144
+    root - memlock 262144
 
 You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).
+Recheck after completing these steps to validate:
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 128
+    262144
 
-You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.
+You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.  Also, running as root does not by itself make the `ulimit -l` limit large enough (and a limits entry for `*` does not apply to `root` either; it must be specified explicitly).
 
 ## Illegal Instruction
 

From 8a2f294d20b396aec08ab0b333ed25f3011c36fc Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 16 Sep 2018 20:24:50 +0200
Subject: [PATCH 14/77] fix that type of `memChunk` is not tested

Fix a copy-paste mistake: the type of the variable `memChunk` was not tested (the check was applied to `idx` instead).
---
 xmrstak/backend/amd/jconf.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0f39ff2b9..9e15c930c 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -142,14 +142,14 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
-	cfg.memChunk = (int)memChunk->GetInt64();
-
-	if(!idx->IsUint64() || cfg.memChunk > 18 )
+	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 )
 	{
 		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
 		return false;
 	}
 
+	cfg.memChunk = (int)memChunk->GetInt64();
+
 	if(!compMode->IsBool())
 		return false;
 

From 2742ef094c6492b881b9fe0dc563e939e0d7d1d9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 17 Sep 2018 08:44:05 +0200
Subject: [PATCH 15/77] avoid OpenCL binary mismatch

Avoid using an OpenCL binary from the cache if the driver or xmr-stak version has changed.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 87721ac8f..dedc32692 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -17,6 +17,7 @@
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/version.hpp"
 
 #include <stdio.h>
 #include <string.h>
@@ -375,6 +376,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
+	std::vector<char> openCLDriverVer(1024);
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+		return ERR_OCL_API;
+	}
+
 	xmrstak_algo miner_algo[2] = {
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
@@ -402,6 +410,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		std::string src_str(source_code);
 		src_str += options;
 		src_str += devNameVec.data();
+		src_str += get_version_str();
+		src_str += openCLDriverVer.data();
+
 		std::string hash_hex_str;
 		picosha2::hash256_hex_string(src_str, hash_hex_str);
 

From 77160cf13a2beaf23c6fa2fad5180080b66583a0 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 11:54:45 +0200
Subject: [PATCH 16/77] fix nicehash `invalid results`

If the first bit of the nonce is `1` (which is very often the case when using a nicehash pool),
some OpenCL implementations may handle the 64-bit representation of the 32-bit
nonce on the device side as a signed integer.
During a right bit shift, incorrect one-bits are then pulled from the upper part of the 64-bit
nonce representation into the 32-bit part of the nonce.
As a result, the computed share is invalid.

- explicitly cast the nonce on the device to `uint` to avoid any side effects
---
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 002472d3a..78cd30c3a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -482,9 +482,14 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 		State[10] = input[10];
 
 		((uint *)State)[9] &= 0x00FFFFFFU;
-		((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
+		((uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
 		((uint *)State)[10] &= 0xFF000000U;
-		((uint *)State)[10] |= ((get_global_id(0) >> 8));
+		/* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+		 * handle get_global_id and get_global_offset as signed long long int and add
+		 * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+		 * (even if it is correct casted to unsigned on the host)
+		 */
+		((uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
 
 		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
 
@@ -605,7 +610,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		tweak1_2 = as_uint2(input[4]);
 		tweak1_2.s0 >>= 24;
 		tweak1_2.s0 |= tweak1_2.s1 << 8;
-		tweak1_2.s1 = get_global_id(0);
+		tweak1_2.s1 = (uint)get_global_id(0);
 		tweak1_2 ^= as_uint2(states[24]);
 #endif
 	}
@@ -918,7 +923,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -994,7 +999,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -1072,7 +1077,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -1133,7 +1138,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }

From 16da98867769892392d0308d93e989748f3dab4c Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 13:00:04 +0200
Subject: [PATCH 17/77] OpenCL: avoid out of memory access

During the initialization of the compile parameters for OpenCL, the
fixed-size buffer could be too small. To avoid this we now use `std::string`.
Using `std::string` is not a problem because this part of the code is not performance critical.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 87721ac8f..273010800 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -388,11 +388,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		int threadMemMask = cn_select_mask(miner_algo[ii]);
 		int hashIterations = cn_select_iter(miner_algo[ii]);
 
-		char options[512];
-		snprintf(options, sizeof(options),
-			"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
-		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
-			int_port(hashMemSize), int(miner_algo[ii]));
+		std::string options;
+		options += " -DITERATIONS=" + std::to_string(hashIterations);
+		options += " -DMASK=" + std::to_string(threadMemMask);
+		options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+		options += " -DSTRIDED_INDEX=" + std::to_string(ctx->stridedIndex);
+		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << ctx->memChunk);
+		options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
+		options += " -DMEMORY=" + std::to_string(hashMemSize);
+		options += " -DALGO=" + std::to_string(miner_algo[ii]);
+
 		/* create a hash for the compile time cache
 		 * used data:
 		 *   - source code
@@ -418,7 +423,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				return ERR_OCL_API;
 			}
 
-			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL);
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
 				size_t len;

From b751af9448f658e36aca884af8dd26a34f618195 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 08:01:57 +0200
Subject: [PATCH 18/77] introduce `cryptonight_v8` and `monero8`

- remove currency `monero7`
- introduce `cryptonight_v8` and `monero8`
---
 xmrstak/backend/cpu/minethd.cpp | 17 ++++++++++++++++-
 xmrstak/backend/cryptonight.hpp | 13 +++++++++++++
 xmrstak/jconf.cpp               |  3 ++-
 xmrstak/misc/executor.cpp       |  2 +-
 xmrstak/net/jpsock.cpp          |  3 +++
 5 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 93ce218a3..e11c82009 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -305,6 +305,16 @@ bool minethd::self_test()
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
+		else if(algo == cryptonight_monero_v8)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+		}
 		else if(algo == cryptonight_aeon)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
@@ -533,7 +543,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, true>
 	};
 
 	std::bitset<2> digit;
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index b6f656138..6b1afa928 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,6 +16,7 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+	cryptonight_monero_v8 = 11
 };
 
 // define aeon settings
@@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_monero>() { return CRYPTONIGHT_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_monero_v8>() { return CRYPTONIGHT_MEMORY; }
+
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
@@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MEMORY;
@@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_monero>() { return CRYPTONIGHT_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_monero_v8>() { return CRYPTONIGHT_MASK; }
+
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MASK; }
 
@@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MASK;
@@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_monero>() { return CRYPTONIGHT_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_monero_v8>() { return CRYPTONIGHT_ITER; }
+
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_ITER; }
 
@@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight:
 		return CRYPTONIGHT_ITER;
 	case cryptonight_ipbc:
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index b6580ea9a..609b55f72 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -99,12 +99,13 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr },
 	{ "cryptonight_v7",      {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
+	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr },
 	{ "graft",               {cryptonight_monero, cryptonight, 8u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "haven",               {cryptonight_haven, cryptonight_heavy, 3u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
+	{ "monero8",             {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	 {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "stellite",            {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 11d0f6df0..02ac8b7f5 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -560,7 +560,7 @@ void executor::ex_main()
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true);
 		break;
-
+	case cryptonight_monero_v8:
 	case cryptonight_monero:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false);
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 9fce9b7e5..d20ba082f 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 		case cryptonight_monero:
 			algo_name = "cryptonight_v7";
 			break;
+		case cryptonight_monero_v8:
+			algo_name = "cryptonight_v8";
+			break;
 		case cryptonight_aeon:
 			algo_name = "cryptonight_lite_v7";
 			break;

From 69f550cb72c4fe22aa6ea6ca8f477559e1899a14 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 08:05:59 +0200
Subject: [PATCH 19/77] CPU: cryptonight_v8

Add support for single hash cryptonight_v8.

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
---
 .../backend/cpu/crypto/cryptonight_aesni.h    | 135 ++++++++++++++----
 xmrstak/backend/cpu/minethd.cpp               |   3 +
 2 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 89c508990..273476096 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -19,6 +19,7 @@
 #include "xmrstak/backend/cryptonight.hpp"
 #include <memory.h>
 #include <stdio.h>
+#include <cfenv>
 
 #ifdef __GNUC__
 #include <x86intrin.h>
@@ -422,6 +423,27 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	_mm_store_si128(output + 11, xout7);
 }
 
+inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
+{
+	__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+	x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+	uint64_t r = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+#if defined _MSC_VER || (__GNUC__ >= 7)
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#else
+	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
+ 	// Fallback to simpler code
+ 	if (x2 < n0) ++r;
+#endif
+	return _mm_cvtsi64_si128(r);
+}
+
 inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 {
 	alignas(16) uint32_t k[4];
@@ -467,6 +489,51 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 }
 
+inline void set_float_rounding_mode()
+{
+#ifdef _MSC_VER
+	_control87(RC_DOWN, MCW_RC);
+#else
+	std::fesetround(FE_DOWNWARD);
+#endif
+}
+
+#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
+
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \
+		const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \
+		/* Most and least significant bits in the divisor are set to 1 \
+		 * to make sure we don't divide by a small or even number, \
+		 * so there are no shortcuts for such cases \
+		 * \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
+		 */  \
+		/* Compiler will optimize it to a single div instruction */ \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
+		sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \
+	}
+
 #define CN_INIT_SINGLE \
 	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
 	{ \
@@ -474,7 +541,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 		return; \
 	}
 
-#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \
 	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
 	uint64_t monero_const; \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -489,16 +556,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	uint64_t idx0; \
 	__m128i bx0; \
 	uint8_t* l0 = ctx[n]->long_state; \
+	/* BEGIN cryptonight_monero_v8 variables */ \
+	__m128i bx1; \
+	__m128i division_result_xmm; \
+	__m128i sqrt_result_xmm; \
+	/* END cryptonight_monero_v8 variables */ \
 	{ \
 		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
 		idx0 = h0[0] ^ h0[4]; \
 		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
 		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+		if(ALGO == cryptonight_monero_v8) \
+		{ \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
+			sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \
+			set_float_rounding_mode(); \
+		} \
 	} \
 	__m128i *ptr0
 
-
-#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
 	__m128i cx; \
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	cx = _mm_load_si128(ptr0); \
@@ -512,7 +590,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 			cx = soft_aesenc(cx, ax0); \
 		else \
 			cx = _mm_aesenc_si128(cx, ax0); \
-	}
+	} \
+	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1)
 
 #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -524,15 +603,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	bx0 = cx; \
+	if(ALGO != cryptonight_monero_v8) \
+		bx0 = cx
 
-#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \
 	uint64_t lo, cl, ch; \
 	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
 	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
 	cl = ((uint64_t*)ptr0)[0]; \
 	ch = ((uint64_t*)ptr0)[1]; \
-	\
+	CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \
+	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
 	{ \
 		uint64_t hi; \
 		lo = _umul128(idx0, cl, &hi); \
@@ -542,7 +628,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	((uint64_t*)ptr0)[0] = al0; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
-	
 
 #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
 	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -622,6 +707,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
 #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
 #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
 
 /** repeat a macro call multiple times
  *
@@ -657,15 +745,14 @@ struct Cryptonight_hash<1>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-
-			REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -687,14 +774,14 @@ struct Cryptonight_hash<2>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -716,14 +803,14 @@ struct Cryptonight_hash<3>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -745,14 +832,14 @@ struct Cryptonight_hash<4>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -774,14 +861,14 @@ struct Cryptonight_hash<5>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index e11c82009..87f4d3285 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -489,6 +489,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_bittube2:
 		algv = 9;
 		break;
+	case cryptonight_monero_v8:
+		algv = 10;
+		break;
 	default:
 		algv = 2;
 		break;

From 5608f8df39504e69c2c1aaaa8ff5e60a83b06ee4 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 08:30:36 +0200
Subject: [PATCH 20/77] OpenCL: cryptonight_v8

- implement cryptonight_v8
- update auto adjust to fit the special requirements of `cryptonight_v8`
- add fast integer math implementations for `sqrt`, `reciprocal` and `division`

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp           |  27 +++-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 138 ++++++++++++++----
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    | 136 +++++++++++++++++
 xmrstak/backend/amd/autoAdjust.hpp            |  20 ++-
 4 files changed, 290 insertions(+), 31 deletions(-)
 create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 8d9b66853..bb39c5764 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -901,6 +901,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	//char* source_code = LoadTextFile(sSourcePath);
 
+	const char *fastIntMathV2CL =
+			#include "./opencl/fast_int_math_v2.cl"
+	;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -921,6 +924,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	;
 
 	std::string source_code(cryptonightCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -930,16 +934,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// create a directory  for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
 
+	// check if cryptonight_monero_v8 is selected for the user or dev pool
+	bool useCryptonight_v8 =
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
 		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
-			const std::string backendName = xmrstak::params::inst().openCLVendor;
 			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
 		}
 
+		if(useCryptonight_v8)
+		{
+			if(ctx[i].stridedIndex == 1)
+			{
+				printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
+				return ERR_STUPID_PARAMS;
+			}
+			if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2)
+			{
+				printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
+				return ERR_STUPID_PARAMS;
+			}
+		}
+
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 78cd30c3a..778c8d5ba 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 }
 #endif
 
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -556,6 +558,8 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
+
+#define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
 		
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
@@ -565,9 +569,24 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 )
 {
-	ulong a[2], b[2];
+	ulong a[2];
+
+// cryptonight_monero_v8
+#if(ALGO==11)		
+	ulong b[4];
+	uint4 b_x[2];
+#else
+	ulong b[2];
+	uint4 b_x[1];
+#endif
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
+// cryptonight_monero_v8
+#if(ALGO==11)
+	__local uint RCP[256];
+	uint2 division_result;
+	uint sqrt_result;
+#endif
 	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
@@ -577,6 +596,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		AES1[i] = rotate(tmp, 8U);
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
+// cryptonight_monero_v8
+#if(ALGO==11)
+		RCP[i] = RCP_C[i];
+#endif
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -584,7 +607,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
     uint2 tweak1_2;
 #endif
-	uint4 b_x;
+
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
@@ -604,7 +627,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		a[1] = states[1] ^ states[5];
 		b[1] = states[3] ^ states[7];
 
-		b_x = ((uint4 *)b)[0];
+		b_x[0] = ((uint4 *)b)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+		a[1] = states[1] ^ states[5];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
+		b_x[1] = ((uint4 *)b)[1];
+		division_result = as_uint2(states[12]);
+		sqrt_result = as_uint2(states[13]).s0;
+#endif
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 		tweak1_2 = as_uint2(input[4]);
@@ -622,37 +655,81 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	if(gIdx < Threads)
 #endif
 	{
-		ulong idx0 = a[0];
+		ulong idx0 = a[0] & MASK;
 
 		#pragma unroll 8
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
 
-			((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
+			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
 #if(ALGO == 10)
 			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
 			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
-			b_x ^= ((uint4 *)c)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
+
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 			uint table = 0x75310U;
+			b_x[0] ^= ((uint4 *)c)[0];
 // cryptonight_stellite
 #	if(ALGO == 7)
-			uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2);
+			uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
 #	else
-			uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+			uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
 #	endif
-			b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+// cryptonight_monero_v8
+#elif(ALGO==11)
+			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+#else
+			b_x[0] ^= ((uint4 *)c)[0];
+			SCRATCHPAD_CHUNK(0) = b_x[0];
 #endif
-			Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
-
+			idx0 = c[0] & MASK;
 			uint4 tmp;
-			tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
-
+			tmp = SCRATCHPAD_CHUNK(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+ 			// Most and least significant bits in the divisor are set to 1
+			// to make sure we don't divide by a small or even number,
+			// so there are no shortcuts for such cases
+			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
+ 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+			// We drop the highest bit to fit both quotient and remainder in 32 bits
+			division_result = fast_div_v2(RCP, c[1], d);
+ 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
+#endif
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
 			a[1] += c[0] * as_ulong2(tmp).s0;
 			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
 
@@ -663,39 +740,42 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
 #	endif
 
 #else
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
-			idx0 = a[0];
-
-			b_x = ((uint4 *)c)[0];
+			idx0 = a[0] & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 10)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = d ^ q;
-#endif
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
-#if (ALGO == 9)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+#elif (ALGO == 9)
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = (~d) ^ q;
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = ((~d) ^ q) & MASK;
+#endif
+		
+// cryptonight_monero_v8
+#if (ALGO == 11)
+			b_x[1] = b_x[0];
 #endif
+			b_x[0] = ((uint4 *)c)[0];
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
new file mode 100644
index 000000000..fe7cea1ee
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -0,0 +1,136 @@
+R"===(
+/*
+ * @author SChernykh -- RCP_C holds 128 pairs of 32-bit words read by get_reciprocal(): word 0 is the base value r1, word 1 packs two 16-bit correction factors.
+ */
+static const __constant uint RCP_C[256] =
+{
+	0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
+	0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
+	0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
+	0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
+	0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
+	0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
+	0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
+	0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
+	0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
+	0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
+	0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
+	0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
+	0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
+	0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
+	0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
+	0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
+	0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
+	0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
+	0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
+	0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
+	0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
+	0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
+	0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
+	0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
+	0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
+	0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
+	0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
+	0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
+	0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
+	0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
+	0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
+	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
+};
+// Derives a 32-bit reciprocal value for the divisor `a` from the byte-addressed table RCP; the caller in cn1 always passes a divisor with bit 31 set. NOTE(review): exact scaling of the result should be confirmed against the reference implementation.
+inline uint get_reciprocal(const __local uchar *RCP, uint a)
+{
+	const uint index1 = (a & 0x7F000000U) >> 21; // bits 24..30 of a, times 8: byte offset of a two-word table entry
+	const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; // bits 8..23 of a, re-centred to -32768..32767
+
+	const uint r1 = *(const __local uint*)(RCP + index1); // first word of the entry
+
+	uint r2_0 = *(const __local uint*)(RCP + index1 + 4); // second word of the entry: two packed 16-bit factors
+	if (index2 > 0) r2_0 >>= 16; // positive offset selects the upper 16 bits, otherwise the lower 16 bits are used
+	const int r2 = r2_0 & 0xFFFFU;
+
+	const uint r = r1 - (uint)(mul24(r2, index2) >> 6); // table value corrected linearly by the offset index2
+
+	const ulong lo0 = (ulong)(r) * a; // from here on: refinement of r -- presumably one Newton-Raphson style step, TODO confirm
+	ulong lo = lo0 + ((ulong)(a) << 32);
+
+	a >>= 1;
+	const bool b = (a >= lo) || (lo >= lo0);
+	lo = a - lo;
+
+	const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo;
+	return as_uint2(k).s1 + (b ? r : 0);
+}
+// Divides the 64-bit `a` by the 32-bit `b` via get_reciprocal(); returns (low 32 bits of the quotient, low 32 bits of the remainder).
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
+{
+	const uint r = get_reciprocal((const __local uchar *)RCP, b);
+	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+
+	ulong q; // quotient estimate: low word = upper 32 bits of k, high word = 1 when k ended up smaller than a (the sum wrapped)
+	((uint*)&q)[0] = as_uint2(k).s1;
+	((uint*)&q)[1] = (k < a) ? 1 : 0;
+
+	const long tmp = a - q * b; // remainder implied by the estimate
+	const bool overshoot = (tmp < 0); // estimate too large, corrected by exactly one step below
+	const bool undershoot = (tmp >= b); // estimate too small, corrected by exactly one step below
+
+	return (uint2)(
+		as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
+		as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
+	);
+}
+// Reciprocal-based division of the 64-bit `a` by the 32-bit `b` that stores the complete quotient in *q (bit 32 included) and the remainder in *r.
+inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r)
+{
+	const uint rcp = get_reciprocal((const __local uchar *)RCP, b);
+	const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a;
+
+	((uint*)q)[0] = as_uint2(k).s1; // low word of the estimate: upper 32 bits of k
+	((uint*)q)[1] = (k < a) ? 1 : 0; // high word: 1 when k ended up smaller than a, i.e. the 64-bit sum wrapped
+
+	long tmp = a - (*q) * b; // remainder implied by the estimated quotient
+
+	const bool overshoot = (tmp < 0);
+	const bool undershoot = (tmp >= b);
+
+	if (overshoot) // estimate too large: step the quotient down once
+	{
+		--(*q);
+		tmp += b;
+	}
+
+	if (undershoot) // estimate too small: step the quotient up once
+	{
+		++(*q);
+		tmp -= b;
+	}
+
+	*r = tmp;
+}
+// Square-root step of cryptonight_monero_v8 on the 64-bit input n1: float estimate via native_sqrt/native_rsqrt, then integer correction. NOTE(review): presumably equals the reference floor(2*sqrt(2^64 + n1)) - 2^33 -- confirm.
+inline uint fast_sqrt_v2(const ulong n1)
+{
+	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); // float bit pattern: exponent field 64+127, mantissa = top 23 bits of n1
+
+	float x1 = native_rsqrt(x);
+	x = native_sqrt(x);
+
+	// The following line does x1 *= 4294967296.0f;
+	x1 = as_float(as_uint(x1) + (32U << 23)); // adding 32 to the exponent field multiplies by 2^32 without a float multiply
+
+	const uint x0 = as_uint(x) - (158U << 23);
+	const long delta0 = n1 - (((long)(x0) * x0) << 18);
+	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+
+	uint result = (x0 << 10) + convert_int_rte(delta);
+	const uint s = result >> 1;
+	const uint b = result & 1;
+
+	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; // integer check value used for the final +-1 fix-up
+	if ((long)(x2 + b) > 0) --result;
+	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+
+	return result;
+}
+)==="
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index d6acec971..4a2ffdb19 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -127,6 +127,24 @@ class autoAdjust
 				minFreeMem = 512u * byteToMiB;
 			}
 
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 =
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+			// set strided index to default
+			ctx.stridedIndex = 1;
+
+			// nvidia performance is very bad if the scratchpad is not contiguous
+			if(ctx.isNVIDIA)
+				ctx.stridedIndex = 0;
+
+			// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
+			if(useCryptonight_v8)
+				ctx.stridedIndex = 2;
+
 			// increase all intensity limits by two for aeon
 			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 				maxThreads *= 2u;
@@ -153,7 +171,7 @@ class autoAdjust
 				// set 8 threads per block (this is a good value for the most gpus)
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n"
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
 					"    \"comp_mode\" : true\n" +
 					"  },\n";
 			}

From d035dbc160de3df3a800e872c37453b1d277db2b Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 10 Sep 2018 08:35:00 +0200
Subject: [PATCH 21/77] NVIDIA: cryptonight_v8

implement `cryptonight_v8`
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 164 +++++++++++++++++-
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  25 ++-
 2 files changed, 184 insertions(+), 5 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 6c6475150..3e6279288 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+{
+	// two 32-bit shuffles of `val`: the word taken from `src` fills bits 0..31, the word taken from `src2` fills bits 32..63
+	const uint64_t low = shuffle<group_n>(ptr, sub, val, src);
+	const uint64_t high = shuffle<group_n>(ptr, sub, val, src2);
+	return low | (high << 32);
+}
+// Square-root helper used by the cryptonight_monero_v8 branch of phase2: double-precision sqrt on a value built from n0, then an integer fix-up. The parameter `i` is not used.
+__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0)
+{
+	uint64_t x = (n0 >> 12) + (1023ULL << 52); // IEEE-754 double bit pattern: exponent field 1023, mantissa = top 52 bits of n0
+	const double xx = sqrt( *reinterpret_cast<double*>(&x) ); // reinterpret the bits as a double and take its square root
+	uint64_t r = *reinterpret_cast<const uint64_t*>(&xx); // raw bit pattern of the root
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); // integer check value for the rounding fix-up below
+
+ 	if (x2 < n0) ++r;
+
+	return r;
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
@@ -227,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
-	uint32_t a, d[2], idx0;
+	uint32_t a, a1, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
 	uint32_t tweak1_2[2];
@@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			idx0 = *(d_ctx_b + threads * 4 + thread);
 		}
 	}
-	d[1] = (d_ctx_b + thread * 4)[sub];
+
+	uint32_t bx1, division_result, sqrt_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		d[1] = (d_ctx_b + thread * 12)[sub];
+		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
+
+		// must be valid only for `sub < 2`
+		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2];
+	}
+	else
+		d[1] = (d_ctx_b + thread * 4)[sub];
 
 	#pragma unroll 2
 	for ( i = start; i < end; ++i )
@@ -296,6 +333,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
 				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
 				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
+				if(ALGO == cryptonight_monero_v8)
+				{
+					a1 = a;
+				}
 				d[x] = a ^
 					t_fn0( x_0 & 0xff ) ^
 					t_fn1( (x_1 >> 8) & 0xff ) ^
@@ -303,6 +344,33 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					t_fn3( ( x_3 >> 24 ) );
 			}
 
+			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
+			if(ALGO == cryptonight_monero_v8)
+			{
+				// Shuffle constants here were chosen carefully
+				// to maximize permutation cycle length
+				// and have no 2-byte elements stay in their places
+				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
+				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
+				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
+
+				uint32_t src = sub & 2;
+				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
+				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
+				const uint64_t cc3 = bx1_64 + chunk3_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
+
+				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
+				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
+				const uint64_t cc1 = bx0_64 + chunk1_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
+
+				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
+				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
+				const uint64_t cc2 = ax0_64 + chunk2_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+				
+			}
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
 			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -331,10 +399,76 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 			uint32_t yy[2];
 			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+
+			if(ALGO == cryptonight_monero_v8 )
+			{
+				const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1);
+
+				// Use division and square root results from the _previous_ iteration to hide the latency
+				const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
+
+
+				const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
+				const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32);
+
+				if(sub < 2)
+					*((uint64_t*)yy) ^= cl_rhs;
+
+
+				const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
+
+				// Most and least significant bits in the divisor are set to 1
+				// to make sure we don't divide by a small or even number,
+				// so there are no shortcuts for such cases
+				//
+				// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+				// We drop the highest bit to fit both quotient and remainder in 32 bits
+
+				// Compiler will optimize it to a single div instruction
+				const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
+
+
+				const uint64_t division_result_tmp = static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
+
+				division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
+								
+				// Use division_result as an input for the square root to prevent parallel implementation in hardware
+				const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp);
+				sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2];
+			}
+
 			uint32_t zz[2];
 			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
 			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
-
+			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
+			if(ALGO == cryptonight_monero_v8)
+			{
+				// Shuffle constants here were chosen carefully
+				// to maximize permutation cycle length
+				// and have no 2-byte elements stay in their places
+				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
+				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
+				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
+
+				uint32_t src = sub & 2;
+				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
+				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
+				const uint64_t cc3 = bx1_64 + chunk3_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
+
+
+
+				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
+				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
+				const uint64_t cc1 = bx0_64 + chunk1_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
+
+				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
+				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
+				const uint64_t cc2 = ax0_64 + chunk2_64;
+				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+			}
+			
 			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
 			#pragma unroll
 			for ( k = 0; k < 2; k++ )
@@ -384,13 +518,31 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 				idx0 = (~d) ^ q;
 			}
+			if(ALGO == cryptonight_monero_v8)
+			{
+				bx1 = d[(x + 1) % 2];
+			}
 		}
 	}
 
 	if ( bfactor > 0 )
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
-		(d_ctx_b + thread * 4)[sub] = d[1];
+		if(ALGO == cryptonight_monero_v8)
+		{
+			(d_ctx_b + thread * 12)[sub] = d[1];
+			(d_ctx_b + thread * 12 + 4)[sub] = bx1;
+
+			if(sub < 2)
+			{
+				// must be valid only for `sub < 2`
+				(d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result;
+			}
+		}
+		else
+			(d_ctx_b + thread * 4)[sub] = d[1];
+			
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
@@ -534,6 +686,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
 	}
+	else if(miner_algo == cryptonight_monero_v8)
+	{
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>(ctx, startNonce);
+	}
 	else if(miner_algo == cryptonight_heavy)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b455f55ca..1ea54ddba 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	if(ALGO == cryptonight_monero_v8)
+	{
+		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
+		// bx1
+		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
+		memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 );
+		// division_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 );
+		// sqrt_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+	}
+	else
+		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
 
 	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
@@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
+	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	{
+		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; // per thread: b and bx1 (16 byte each), division_result (8 byte) and sqrt_result (8 byte)
+		ctx->d_ctx_state2 = ctx->d_ctx_state; // this branch replaces the final `else`, so the state alias it sets up has to be repeated here
+	}
 	else
 		ctx->d_ctx_state2 = ctx->d_ctx_state;
 
@@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_monero_v8) // chained, so the final `else` (generic kernel) only runs when no branch above matched
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,

From 522ff6a67b222a3584964ac7a75e53da6187c279 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 14 Sep 2018 20:48:13 +0200
Subject: [PATCH 22/77] NVIDIA: optimize shuffle

- use shared memory to exchange
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 141 +++++++++++-------
 1 file changed, 83 insertions(+), 58 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 3e6279288..1273f89e9 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -252,7 +252,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
-	uint32_t a, a1, d[2], idx0;
+	uint32_t a, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
 	uint32_t tweak1_2[2];
@@ -296,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		for ( int x = 0; x < 2; ++x )
 		{
 			j = ( ( idx0 & MASK ) >> 2 ) + sub;
-
+			
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
@@ -327,50 +327,69 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					}
 				}
 			}
+			else if(ALGO == cryptonight_monero_v8)
+			{
+
+				const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+				uint4 chunk0{};
+				chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0);
+				chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0);
+				chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0);
+				chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0);
+
+				const uint32_t x_0 = ((uint32_t*)&chunk0)[sub];
+				const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4];
+				const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4];
+				const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4];
+				d[x] = a ^
+					t_fn0( x_0 & 0xff ) ^
+					t_fn1( (x_1 >> 8) & 0xff ) ^
+					t_fn2( (x_2 >> 16) & 0xff ) ^
+					t_fn3( ( x_3 >> 24 ) );
+
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+
+				if(sub > 0)
+				{
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
+			}
 			else
 			{
 				const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
 				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
 				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
 				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
-				if(ALGO == cryptonight_monero_v8)
-				{
-					a1 = a;
-				}
 				d[x] = a ^
 					t_fn0( x_0 & 0xff ) ^
 					t_fn1( (x_1 >> 8) & 0xff ) ^
 					t_fn2( (x_2 >> 16) & 0xff ) ^
 					t_fn3( ( x_3 >> 24 ) );
 			}
-
-			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
-			if(ALGO == cryptonight_monero_v8)
-			{
-				// Shuffle constants here were chosen carefully
-				// to maximize permutation cycle length
-				// and have no 2-byte elements stay in their places
-				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
-				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
-				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
-
-				uint32_t src = sub & 2;
-				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
-				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
-				const uint64_t cc3 = bx1_64 + chunk3_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
-
-				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
-				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
-				const uint64_t cc1 = bx0_64 + chunk1_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
-
-				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
-				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
-				const uint64_t cc2 = ax0_64 + chunk2_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
-				
-			}
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
 			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -443,30 +462,36 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
 			if(ALGO == cryptonight_monero_v8)
 			{
-				// Shuffle constants here were chosen carefully
-				// to maximize permutation cycle length
-				// and have no 2-byte elements stay in their places
-				const uint32_t chunk1 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10)  );
-				const uint32_t chunk2 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20)  );
-				const uint32_t chunk3 = loadGlobal32<uint32_t>( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30)  );
-
-				uint32_t src = sub & 2;
-				const uint64_t bx1_64 = shuffle64<4>(sPtr,sub, bx1, src, src | 1);
-				const uint64_t chunk3_64 = shuffle64<4>(sPtr,sub, chunk3, src, src | 1);
-				const uint64_t cc3 = bx1_64 + chunk3_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x10), ((uint32_t*)&cc3)[sub & 1]);
-
-
-
-				const uint64_t bx0_64 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], src, src | 1);
-				const uint64_t chunk1_64 = shuffle64<4>(sPtr,sub, chunk1, src, src | 1);
-				const uint64_t cc1 = bx0_64 + chunk1_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x20), ((uint32_t*)&cc1)[sub & 1]);
-
-				const uint64_t ax0_64 = shuffle64<4>(sPtr,sub, a1, src, src | 1);
-				const uint64_t chunk2_64 = shuffle64<4>(sPtr,sub, chunk2, src, src | 1);
-				const uint64_t cc2 = ax0_64 + chunk2_64;
-				storeGlobal32( (uint32_t*)((uint64_t)(long_state + j) ^ 0x30), ((uint32_t*)&cc2)[sub & 1]);
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+				if(sub > 0)
+				{
+					const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
 			}
 			
 			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);

From df1a4200ec781b1bd8a1d53e5a4fc8c8329672f9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 15 Sep 2018 22:43:21 +0200
Subject: [PATCH 23/77] OpenCL: optimize NVIDIA pass

Create a special pass for NVIDIA GPUs to load memory chunks first into the shared memory.

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
---
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 53 ++++++++++++++++---
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 778c8d5ba..9f474da87 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -418,6 +418,9 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+)==="
+R"===(
+
 #define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
 
 #if(STRIDED_INDEX==0)
@@ -559,8 +562,14 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
-#define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
-		
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
+#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#else
+#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
+#endif
+	
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -575,6 +584,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(ALGO==11)		
 	ulong b[4];
 	uint4 b_x[2];
+// NVIDIA
+#	ifdef __NV_CL_C_VERSION
+	__local uint16 scratchpad_line_buf[WORKSIZE];
+ 	__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
+#	endif
 #else
 	ulong b[2];
 	uint4 b_x[1];
@@ -661,6 +675,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+			ulong idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#endif
 
 			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
@@ -694,14 +713,24 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	endif
 			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
 			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
 // cryptonight_monero_v8
 #elif(ALGO==11)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+#	ifdef __NV_CL_C_VERSION
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+ 			idx0 = c[0] & MASK;
+ 			idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#	else
+			idx0 = c[0] & MASK;
+#	endif
 #else
 			b_x[0] ^= ((uint4 *)c)[0];
 			SCRATCHPAD_CHUNK(0) = b_x[0];
-#endif
 			idx0 = c[0] & MASK;
+#endif
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 // cryptonight_monero_v8
@@ -753,6 +782,16 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
+
+// cryptonight_monero_v8
+#if (ALGO == 11)
+#	if defined(__NV_CL_C_VERSION)
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+#	endif
+			b_x[1] = b_x[0];
+#endif
+			b_x[0] = ((uint4 *)c)[0];
 			idx0 = a[0] & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
@@ -771,16 +810,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			idx0 = ((~d) ^ q) & MASK;
 #endif
 		
-// cryptonight_monero_v8
-#if (ALGO == 11)
-			b_x[1] = b_x[0];
-#endif
-			b_x[0] = ((uint4 *)c)[0];
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+)==="
+R"===(
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {

From 28f41a6e8a4f562272f86c8de9b582e530a4221f Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 16 Sep 2018 20:38:10 +0200
Subject: [PATCH 24/77] AMD: add unroll option

Add option `unroll` for OpenCL to allow better tuning of the main POW kernel.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp               |  1 +
 xmrstak/backend/amd/amd_gpu/gpu.hpp               |  1 +
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl |  2 +-
 xmrstak/backend/amd/autoAdjust.hpp                |  2 +-
 xmrstak/backend/amd/config.tpl                    |  4 +++-
 xmrstak/backend/amd/jconf.cpp                     | 12 ++++++++++--
 xmrstak/backend/amd/jconf.hpp                     |  1 +
 xmrstak/backend/amd/minethd.cpp                   |  1 +
 8 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index bb39c5764..767e53855 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -405,6 +405,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
 		options += " -DMEMORY=" + std::to_string(hashMemSize);
 		options += " -DALGO=" + std::to_string(miner_algo[ii]);
+		options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
 
 		/* create a hash for the compile time cache
 		 * used data:
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5ab80b82a..63c5029d7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -27,6 +27,7 @@ struct GpuContext
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
+	int unroll = 0;
 	bool isNVIDIA = false;
 	int compMode;
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 9f474da87..7d0ad1818 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -671,7 +671,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	{
 		ulong idx0 = a[0] & MASK;
 
-		#pragma unroll 8
+		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 4a2ffdb19..c5b331c87 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -172,7 +172,7 @@ class autoAdjust
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-					"    \"comp_mode\" : true\n" +
+					"    \"unroll\" : 8, \"comp_mode\" : true\n" +
 					"  },\n";
 			}
 			else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 28855f070..0101b7e2f 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -13,13 +13,15 @@ R"===(
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [0;127]
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
  *                 to use a intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
  *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, 
+ *    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
  * ],
  * If you do not wish to mine with your AMD GPU(s) then use:
  * "gpu_threads_conf" :
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 9e15c930c..cd2486973 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
+	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
-		stridedIndex == nullptr || compMode == nullptr)
+		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -149,6 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	}
 
 	cfg.memChunk = (int)memChunk->GetInt64();
+	
+	if(!unroll->IsUint64() || unroll->GetUint64() >= 128 ) // compare the full 64-bit value: casting to int first lets huge values wrap into the valid range
+	{
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128");
+		return false;
+	}
+	cfg.unroll = (int)unroll->GetInt64(); // safe narrowing: the value was just checked to be below 128
 
 	if(!compMode->IsBool())
 		return false;
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 580b69fe7..b852c5940 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ class jconf
 		long long cpu_aff;
 		int stridedIndex;
 		int memChunk;
+		int unroll;
 		bool compMode;
 	};
 
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index d6051ffcd..5ac246335 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -99,6 +99,7 @@ bool minethd::init_gpus()
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
+		vGpuData[i].unroll = cfg.unroll;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;

From 2932de6951dc94a4cff7eec45b70dcda770b9bc9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 16 Sep 2018 22:23:36 +0200
Subject: [PATCH 25/77] assembler version for cryptonight_v8

Add @SChernykh's assembler version for Ryzen and Intel processors.

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
---
 .../cpu/crypto/asm/cryptonigh_v8_main_loop.S  |  21 +++
 .../crypto/asm/cryptonigh_v8_main_loop.asm    |  18 ++
 .../asm/cryptonigh_v8_main_loop_ivybridge.inc | 176 ++++++++++++++++++
 .../asm/cryptonigh_v8_main_loop_ryzen.inc     | 174 +++++++++++++++++
 4 files changed, 389 insertions(+)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc

diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
new file mode 100644
index 000000000..cd747f7c5
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -0,0 +1,21 @@
+#define ALIGN .align /* the included .inc bodies spell the directive as ALIGN; map it to the GAS form */
+.intel_syntax noprefix
+.section .text
+.global cryptonigh_v8_mainloop_ivybridge_asm
+.global cryptonigh_v8_mainloop_ryzen_asm
+
+ALIGN 64
+cryptonigh_v8_mainloop_ivybridge_asm: /* wrapper around the Ivy Bridge main-loop body */
+	sub rsp, 48 /* scratch area: the included body stores rbx at [rsp+24] before it pushes anything */
+	mov rcx, rdi /* the included body reads the context through rcx; rdi presumably holds the first argument (System V ABI) */
+	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+	add rsp, 48 /* release the scratch area reserved above */
+	ret 0
+
+ALIGN 64
+cryptonigh_v8_mainloop_ryzen_asm: /* wrapper around the Ryzen main-loop body */
+	sub rsp, 48 /* scratch area: the included body stores rbx/rbp/rsi at [rsp+16], [rsp+24], [rsp+32] */
+	mov rcx, rdi /* the included body reads the context through rcx; rdi presumably holds the first argument (System V ABI) */
+	#include "cryptonigh_v8_main_loop_ryzen.inc"
+	add rsp, 48 /* release the scratch area reserved above */
+	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
new file mode 100644
index 000000000..2101a59ce
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -0,0 +1,18 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE ; dedicated executable segment holding both main-loop variants
+PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
+PUBLIC cryptonigh_v8_mainloop_ryzen_asm
+
+ALIGN 64
+cryptonigh_v8_mainloop_ivybridge_asm PROC ; body comes from the .inc file that the GAS wrapper (.S) also includes
+	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+	ret 0
+cryptonigh_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 64
+cryptonigh_v8_mainloop_ryzen_asm PROC ; body comes from the .inc file that the GAS wrapper (.S) also includes
+	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+	ret 0
+cryptonigh_v8_mainloop_ryzen_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
new file mode 100644
index 000000000..ea7f799fd
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
@@ -0,0 +1,176 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 64
+$main_loop_ivybridge:
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movq	 rcx, xmm3
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	mov	 rax, rcx
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	shl	 rax, 32
+	xor	 rdi, rax
+	movq	 rbp, xmm6
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	mov	 r10, rbp
+	and	 r10d, 2097136
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r10+rbx]
+	lea	 r14, QWORD PTR [r10+rbx]
+	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movq	 rax, xmm0
+	div	 r9
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	movq	 rdx, xmm3
+	test	 rdx, 524287
+	je	 $sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+$sqrt_fixup_ivybridge_ret:
+
+	mov	 r9, r10
+	mov	 rax, rdi
+	mul	 rbp
+
+	xor	 r9, 16
+	mov	 rcx, r10
+	xor	 rcx, 32
+	xor	 r10, 48
+	add	 r8, rdx
+	add	 r11, rax
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov	 r10, r8
+	mov	 QWORD PTR [r14+8], r11
+	and	 r10d, 2097136
+	xor	 r11, r12
+	dec rsi
+	jne	 $main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp $cnv2_main_loop_ivybridge_endp
+
+$sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov	 r13, -4389456576512
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	mov r13, 4389456576511
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movq	 xmm3, rdx
+	jmp	 $sqrt_fixup_ivybridge_ret
+
+$cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
new file mode 100644
index 000000000..5797f5497
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
@@ -0,0 +1,174 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 64
+$main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movq	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0
+
+	div	r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	$sqrt_fixup_ryzen
+	shr	rdi, 19
+
+$sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	xmm0, xmm4
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	$main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp $cnv2_main_loop_ryzen_endp
+
+$sqrt_fixup_ryzen:
+	movq r9, xmm2
+	dec	rdi
+	mov	rdx, 4389456576511
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	sub	rcx, rdx
+	mov	rdx, -4389456576512
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	$sqrt_fixup_ryzen_ret
+
+$cnv2_main_loop_ryzen_endp:

From 0a9a9abaf7afceeb923292630b93b0fe4830efea Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 16 Sep 2018 22:24:56 +0200
Subject: [PATCH 26/77] infrastructure to load asm code

- add new option to `cpu.txt` named `asm` to select the asm code version
- extend function selection method to choose assembler code for `cryptonight_v8`
- update auto adjustment to add default value for option `asm`
---
 CMakeLists.txt                                | 11 +++++-
 xmrstak/backend/cpu/autoAdjust.hpp            |  2 +-
 xmrstak/backend/cpu/autoAdjustHwloc.hpp       |  2 +-
 xmrstak/backend/cpu/config.tpl                | 13 ++++---
 .../backend/cpu/crypto/cryptonight_aesni.h    | 21 +++++++++++
 xmrstak/backend/cpu/jconf.cpp                 |  7 +++-
 xmrstak/backend/cpu/jconf.hpp                 |  1 +
 xmrstak/backend/cpu/minethd.cpp               | 35 +++++++++++++++----
 xmrstak/backend/cpu/minethd.hpp               |  5 +--
 9 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d..067bbd0a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE)
 endif()
 target_link_libraries(xmr-stak-c ${LIBS})
 
+enable_language(ASM)
+# asm optimized monero v8 code
+add_library(xmr-stak-asm
+    STATIC
+    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+)
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
+
 # compile generic backend files
 file(GLOB BACKEND_CPP
     "xmrstak/*.cpp"
@@ -472,7 +481,7 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef053..8588fea8c 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -82,7 +82,7 @@ class autoAdjust
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8..a73de8618 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec..bfffc851e 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -7,10 +7,15 @@ R"===(
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm            - Allows switching to an assembler version of cryptonight_v8; allowed values [auto, intel, ryzen]
+ *                    - auto: uses the default implementation (no assembler version)
+ *                    - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+ *                    - ryzen: AMD Ryzen (1xxx and 2xxx series)
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
  *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +26,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 273476096..0ab47e390 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -876,3 +876,24 @@ struct Cryptonight_hash<5>
 		REPEAT_5(0, CN_FINALIZE);
 	}
 };
+
+extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); // defined in asm/cryptonigh_v8_main_loop.S and .asm
+extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); // defined in asm/cryptonigh_v8_main_loop.S and .asm
+
+template<xmrstak_algo ALGO, int asm_version> // asm_version == 1 selects the Ivy Bridge loop, any other value the Ryzen loop
+void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) // hashes one input; only ctx[0] is used
+{
+	constexpr size_t MEM = cn_select_memory<ALGO>(); // scratchpad size chosen at compile time from ALGO
+
+	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); // hash the input into ctx[0]->hash_state
+	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); // fill long_state from hash_state
+
+	if (asm_version == 1) // asm_version is a template constant, so the choice is fixed per instantiation
+		cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]);
+	else
+		cryptonigh_v8_mainloop_ryzen_asm(ctx[0]);
+
+	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); // fold long_state back into hash_state
+	keccakf((uint64_t*)ctx[0]->hash_state, 24); // presumably 24 permutation rounds - TODO confirm against keccakf
+	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); // low two bits of the state pick the final hash, which writes to output
+}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d..1f9501c40 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
 	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
 		return false;
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	if(asm_version == nullptr || !asm_version->IsString()) // "asm" is not covered by the nullptr guard above; reject a missing key instead of dereferencing null
+		return false;
+	cfg.asm_version_str = asm_version->GetString();
+
 	return true;
 }
 
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036e..4ec9165d5 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ class jconf
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 87f4d3285..f07c71481 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -441,7 +442,7 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
 
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
@@ -449,9 +450,31 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 }
 
 template<size_t N>
-minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
+	
+	// check for asm optimized version for cryptonight_v8
+	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	{
+		if(asm_version_str != "auto")
+		{
+			if(asm_version_str == "intel")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+			}
+			if(asm_version_str == "ryzen")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
+			}
+			else
+			{
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+			}
+		}
+	}
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -636,7 +659,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -671,12 +694,12 @@ void minethd::multiway_work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 26478542c..53ff93c15 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -32,9 +32,9 @@ class minethd : public iBackend
 private:
 
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto");
 
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
 	template<uint32_t N>
 	void multiway_work_main();
@@ -60,6 +60,7 @@ class minethd : public iBackend
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "auto";
 };
 
 } // namespace cpu

From 0254553871ca33010d7cfe8cfe3d5b25b21cb013 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 16 Sep 2018 23:01:46 +0200
Subject: [PATCH 27/77] optimize single hash cryptonight_v8

If single hash is used, the type of the variable that holds the intermediate sqrt value is
changed from `__m128i` to `uint64_t` as suggested by @SChernykh
---
 .../backend/cpu/crypto/cryptonight_aesni.h    | 88 ++++++++++++++-----
 1 file changed, 66 insertions(+), 22 deletions(-)

diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 0ab47e390..7c409d187 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -20,6 +20,7 @@
 #include <memory.h>
 #include <stdio.h>
 #include <cfenv>
+#include <utility>
 
 #ifdef __GNUC__
 #include <x86intrin.h>
@@ -423,7 +424,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	_mm_store_si128(output + 11, xout7);
 }
 
-inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
+inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
 {
 	__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
 	x = _mm_sqrt_sd(_mm_setzero_pd(), x);
@@ -441,7 +442,7 @@ inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
  	// Fallback to simpler code
  	if (x2 < n0) ++r;
 #endif
-	return _mm_cvtsi64_si128(r);
+	return r;
 }
 
 inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
@@ -489,6 +490,48 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 }
 
+/** optimal type for sqrt
+ *
+ * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected.
+ *
+ * @tparam N number of hashes per thread
+ */
+template<size_t N>
+struct GetOptimalSqrtType
+{
+	using type = __m128i;
+};
+
+template<>
+struct GetOptimalSqrtType<1u>
+{
+	using type = uint64_t;
+};
+template<size_t N>
+using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
+
+/** assign a value and convert if necessary
+ *
+ * @param output output type
+ * @param input value which is assigned to output
+ * @{
+ */
+inline void assign(__m128i& output, const uint64_t input)
+{
+	output = _mm_cvtsi64_si128(input);
+}
+
+inline void assign(uint64_t& output, const uint64_t input)
+{
+	output = input;
+}
+
+inline void assign(uint64_t& output, const __m128i& input)
+{
+	output = _mm_cvtsi128_si64(input);
+}
+/** @} */
+
 inline void set_float_rounding_mode()
 {
 #ifdef _MSC_VER
@@ -511,14 +554,15 @@ inline void set_float_rounding_mode()
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
 	}
 
-#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
-		const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \
+		uint64_t sqrt_result_tmp; \
+		assign(sqrt_result_tmp, sqrt_result); \
 		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
 		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
-		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \
-		const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
+		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
 		/* Most and least significant bits in the divisor are set to 1 \
 		 * to make sure we don't divide by a small or even number, \
 		 * so there are no shortcuts for such cases \
@@ -531,7 +575,7 @@ inline void set_float_rounding_mode()
 		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
 		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
 		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
-		sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \
+		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
 	}
 
 #define CN_INIT_SINGLE \
@@ -541,7 +585,7 @@ inline void set_float_rounding_mode()
 		return; \
 	}
 
-#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \
 	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
 	uint64_t monero_const; \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -559,7 +603,7 @@ inline void set_float_rounding_mode()
 	/* BEGIN cryptonight_monero_v8 variables */ \
 	__m128i bx1; \
 	__m128i division_result_xmm; \
-	__m128i sqrt_result_xmm; \
+	GetOptimalSqrtType_t<N> sqrt_result; \
 	/* END cryptonight_monero_v8 variables */ \
 	{ \
 		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
@@ -570,7 +614,7 @@ inline void set_float_rounding_mode()
 		{ \
 			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
 			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
-			sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \
+			assign(sqrt_result, h0[13]); \
 			set_float_rounding_mode(); \
 		} \
 	} \
@@ -606,13 +650,13 @@ inline void set_float_rounding_mode()
 	if(ALGO != cryptonight_monero_v8) \
 		bx0 = cx
 
-#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
 	uint64_t lo, cl, ch; \
 	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
 	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
 	cl = ((uint64_t*)ptr0)[0]; \
 	ch = ((uint64_t*)ptr0)[1]; \
-	CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \
+	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
 	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
@@ -745,14 +789,14 @@ struct Cryptonight_hash<1>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
+		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
 			REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
+			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
 			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -774,14 +818,14 @@ struct Cryptonight_hash<2>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
+		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
 			REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
+			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
 			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -803,14 +847,14 @@ struct Cryptonight_hash<3>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
+		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
 			REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
+			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
 			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -832,14 +876,14 @@ struct Cryptonight_hash<4>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
+		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
 			REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
+			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
 			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -861,14 +905,14 @@ struct Cryptonight_hash<5>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
+		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
 			REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
+			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
 			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}

From db70071534692198336e6a09d61323eb18f09bc8 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 17 Sep 2018 08:22:51 +0200
Subject: [PATCH 28/77] CPU: change default for `asm`

Replace the asm option `auto` with `off`
---
 xmrstak/backend/cpu/autoAdjust.hpp      | 2 +-
 xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +-
 xmrstak/backend/cpu/config.tpl          | 8 ++++----
 xmrstak/backend/cpu/minethd.cpp         | 2 +-
 xmrstak/backend/cpu/minethd.hpp         | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 8588fea8c..28ff515d4 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -82,7 +82,7 @@ class autoAdjust
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index a73de8618..2bebf82d0 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"auto\", \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"off\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index bfffc851e..e4da15fad 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -10,8 +10,8 @@ R"===(
  * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen]
- *                    - auto: used the default implementation (no assembler version)
+ * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [off, intel, ryzen]
+ *                    - off: used the default implementation (no assembler version)
  *                    - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
  *                    - ryzen: AMD Ryzen (1xxx and 2xxx series)
  *
@@ -26,8 +26,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index f07c71481..2f01d5e90 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -457,7 +457,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	// check for asm optimized version for cryptonight_v8
 	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
 	{
-		if(asm_version_str != "auto")
+		if(asm_version_str != "off")
 		{
 			if(asm_version_str == "intel")
 			{
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 53ff93c15..eb77749f6 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -32,7 +32,7 @@ class minethd : public iBackend
 private:
 
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto");
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off");
 
 	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
@@ -60,7 +60,7 @@ class minethd : public iBackend
 
 	bool bQuit;
 	bool bNoPrefetch;
-	std::string asm_version_str = "auto";
+	std::string asm_version_str = "off";
 };
 
 } // namespace cpu

From 354c208569500c4c65a9dd5ed6ac442fea75113e Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 17 Sep 2018 08:25:12 +0200
Subject: [PATCH 29/77] fix compiler incompatibilities

- fix assembler code to pass the clang compiler
- CMake: set asm file language
- fix icc with gcc-7 compile issue with `_addcarry_u64`
---
 CMakeLists.txt                                                | 1 +
 .../cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc      | 4 ++--
 .../backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc  | 4 ++--
 xmrstak/backend/cpu/crypto/cryptonight_aesni.h                | 4 +++-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 067bbd0a2..cf439227f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -459,6 +459,7 @@ endif()
 target_link_libraries(xmr-stak-c ${LIBS})
 
 enable_language(ASM)
+set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
 # asm optimized monero v8 code
 add_library(xmr-stak-asm
     STATIC
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
index ea7f799fd..1cc20b35a 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
@@ -157,14 +157,14 @@ $sqrt_fixup_ivybridge_ret:
 
 $sqrt_fixup_ivybridge:
 	dec	 rdx
-	mov	 r13, -4389456576512
+	movq	 r13, -4389456576512
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	mov r13, 4389456576511
+	movq	 r13, 4389456576511
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
index 5797f5497..c564d8949 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
@@ -157,14 +157,14 @@ $sqrt_fixup_ryzen_ret:
 $sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	mov	rdx, 4389456576511
+	movq	rdx, 4389456576511
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
 	sub	rcx, rdx
-	mov	rdx, -4389456576512
+	movq	rdx, -4389456576512
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 7c409d187..0838cfac4 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -435,7 +435,9 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
 
 	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
 
-#if defined _MSC_VER || (__GNUC__ >= 7)
+#ifdef __INTEL_COMPILER
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r);
+#elif defined(_MSC_VER) || (__GNUC__ >= 7)
 	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
 #else
 	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence

From 13fbb8a541db75484af7a457b2c892e7e0b5cbca Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 17 Sep 2018 09:16:06 +0200
Subject: [PATCH 30/77] asm compiler compatibility

- add special asm version for win64 and linux
- add cmake path for MSVC and other systems
---
 CMakeLists.txt                                |  40 ++--
 .../cpu/crypto/asm/cryptonigh_v8_main_loop.S  |  22 ++-
 .../crypto/asm/cryptonigh_v8_main_loop.asm    |   8 +-
 ...yptonigh_v8_main_loop_ivybridge_linux.inc} |  22 +--
 ...ryptonigh_v8_main_loop_ivybridge_win64.inc | 176 ++++++++++++++++++
 ...> cryptonigh_v8_main_loop_ryzen_linux.inc} |  22 +--
 .../cryptonigh_v8_main_loop_ryzen_win64.inc   | 174 +++++++++++++++++
 7 files changed, 415 insertions(+), 49 deletions(-)
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge.inc => cryptonigh_v8_main_loop_ivybridge_linux.inc} (91%)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen.inc => cryptonigh_v8_main_loop_ryzen_linux.inc} (92%)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf439227f..b51eb2ae4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # asm optimized monero v8 code
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm"
+    )
+else()
+    # asm optimized monero v8 code
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY C)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -456,17 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
-
-enable_language(ASM)
-set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY LANGUAGE C)
-# asm optimized monero v8 code
-add_library(xmr-stak-asm
-    STATIC
-    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
-)
-set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
-
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -509,7 +519,7 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
@@ -522,7 +532,7 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
@@ -538,7 +548,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 ################################################################################
 # Install
@@ -569,4 +579,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
 else()
     # this rule is used if the install prefix is the build directory
     install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")")
-endif()
+endif()
\ No newline at end of file
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
index cd747f7c5..736dac7de 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
@@ -1,21 +1,27 @@
 #define ALIGN .align
 .intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
 .section .text
-.global cryptonigh_v8_mainloop_ivybridge_asm
-.global cryptonigh_v8_mainloop_ryzen_asm
+#endif
+.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm)
 
-ALIGN 64
-cryptonigh_v8_mainloop_ivybridge_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ivybridge.inc"
+        #include "cryptonigh_v8_main_loop_ivybridge_linux.inc"
 	add rsp, 48
 	ret 0
 
-ALIGN 64
-cryptonigh_v8_mainloop_ryzen_asm:
+ALIGN 8
+FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm):
 	sub rsp, 48
 	mov rcx, rdi
-	#include "cryptonigh_v8_main_loop_ryzen.inc"
+        #include "cryptonigh_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
index 2101a59ce..7f2d6a584 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
@@ -2,15 +2,15 @@ _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
 PUBLIC cryptonigh_v8_mainloop_ryzen_asm
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ivybridge_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ivybridge.inc
+        INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ivybridge_asm ENDP
 
-ALIGN 64
+ALIGN 8
 cryptonigh_v8_mainloop_ryzen_asm PROC
-	INCLUDE cryptonigh_v8_main_loop_ryzen.inc
+        INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc
 	ret 0
 cryptonigh_v8_mainloop_ryzen_asm ENDP
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
similarity index 91%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
index 1cc20b35a..23f6cc060 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
@@ -49,8 +49,8 @@
 	movq	 xmm0, rcx
 	punpcklqdq xmm5, xmm0
 
-	ALIGN 64
-$main_loop_ivybridge:
+	ALIGN 8
+main_loop_ivybridge:
 	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
@@ -105,10 +105,10 @@ $main_loop_ivybridge:
 	sqrtsd	 xmm3, xmm0
 	movq	 rdx, xmm3
 	test	 rdx, 524287
-	je	 $sqrt_fixup_ivybridge
+	je	 sqrt_fixup_ivybridge
 	psrlq	 xmm3, 19
 	psubq	 xmm3, XMMWORD PTR [rsp+16]
-$sqrt_fixup_ivybridge_ret:
+sqrt_fixup_ivybridge_ret:
 
 	mov	 r9, r10
 	mov	 rax, rdi
@@ -138,7 +138,7 @@ $sqrt_fixup_ivybridge_ret:
 	and	 r10d, 2097136
 	xor	 r11, r12
 	dec rsi
-	jne	 $main_loop_ivybridge
+	jne	 main_loop_ivybridge
 
 	ldmxcsr DWORD PTR [rsp]
 	mov	 rbx, QWORD PTR [rsp+160]
@@ -153,24 +153,24 @@ $sqrt_fixup_ivybridge_ret:
 	pop	 rdi
 	pop	 rsi
 	pop	 rbp
-	jmp $cnv2_main_loop_ivybridge_endp
+	jmp cnv2_main_loop_ivybridge_endp
 
-$sqrt_fixup_ivybridge:
+sqrt_fixup_ivybridge:
 	dec	 rdx
-	movq	 r13, -4389456576512
+	movq r13, -4389456576512
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	movq	 r13, 4389456576511
+	movq r13, 4389456576511
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
 	sub	 rcx, r9
 	adc	 rdx, 0
 	movq	 xmm3, rdx
-	jmp	 $sqrt_fixup_ivybridge_ret
+	jmp	 sqrt_fixup_ivybridge_ret
 
-$cnv2_main_loop_ivybridge_endp:
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
new file mode 100644
index 000000000..ee7f31716
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,176 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+
+	ALIGN 8
+main_loop_ivybridge:
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movd	 rcx, xmm3
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	mov	 rax, rcx
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	shl	 rax, 32
+	xor	 rdi, rax
+	movd	 rbp, xmm6
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	mov	 r10, rbp
+	and	 r10d, 2097136
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r10+rbx]
+	lea	 r14, QWORD PTR [r10+rbx]
+	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movd	 rax, xmm0
+	div	 r9
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	movd	 rdx, xmm3
+	test	 rdx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 r9, r10
+	mov	 rax, rdi
+	mul	 rbp
+
+	xor	 r9, 16
+	mov	 rcx, r10
+	xor	 rcx, 32
+	xor	 r10, 48
+	add	 r8, rdx
+	add	 r11, rax
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov	 r10, r8
+	mov	 QWORD PTR [r14+8], r11
+	and	 r10d, 2097136
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov  r13, -4389456576512
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	mov  r13, 4389456576511
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movd	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
similarity index 92%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
index c564d8949..551ee8573 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
@@ -45,8 +45,8 @@
 	movq	xmm0, rcx
 	punpcklqdq xmm4, xmm0
 
-	ALIGN 64
-$main_loop_ryzen:
+	ALIGN 8
+main_loop_ryzen:
 	movdqa	xmm5, XMMWORD PTR [r10+rbx]
 	movq	xmm0, r11
 	movq	xmm6, r8
@@ -103,10 +103,10 @@ $main_loop_ryzen:
 	sqrtsd	xmm1, xmm0
 	movq	rdi, xmm1
 	test	rdi, 524287
-	je	$sqrt_fixup_ryzen
+	je	sqrt_fixup_ryzen
 	shr	rdi, 19
 
-$sqrt_fixup_ryzen_ret:
+sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
 
@@ -136,7 +136,7 @@ $sqrt_fixup_ryzen_ret:
 	and	r10d, 2097136
 	movdqa	xmm3, xmm5
 	dec	ebp
-	jne	$main_loop_ryzen
+	jne	main_loop_ryzen
 
 	ldmxcsr DWORD PTR [rsp]
 	movaps	xmm6, XMMWORD PTR [rsp+48]
@@ -152,23 +152,23 @@ $sqrt_fixup_ryzen_ret:
 	pop	r13
 	pop	r12
 	pop	rdi
-	jmp $cnv2_main_loop_ryzen_endp
+	jmp cnv2_main_loop_ryzen_endp
 
-$sqrt_fixup_ryzen:
+sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	movq	rdx, 4389456576511
+	movq rdx, 4389456576511
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
 	sub	rcx, rdx
-	movq	rdx, -4389456576512
+	movq rdx, -4389456576512
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9
 	adc	rdi, 0
-	jmp	$sqrt_fixup_ryzen_ret
+	jmp	sqrt_fixup_ryzen_ret
 
-$cnv2_main_loop_ryzen_endp:
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
new file mode 100644
index 000000000..f70dccef8
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,174 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movd xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movd	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0
+
+	div	r9
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	xmm0, xmm4
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movd r9, xmm2
+	dec	rdi
+	mov rdx, 4389456576511
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	sub	rcx, rdx
+	mov rdx, -4389456576512
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:

From ce84244ac99e0d4714150d52a6ec12a5a5f80621 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Tue, 18 Sep 2018 20:53:20 +0200
Subject: [PATCH 31/77] fix segfault if option `asm` is not in `cpu.txt`

---
 xmrstak/backend/cpu/jconf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 1f9501c40..a14be1732 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -114,7 +114,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	asm_version = GetObjectMember(oThdConf, "asm");
 
-	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
+	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr)
 		return false;
 
 	if(!mode->IsBool() && !mode->IsNumber())

From 9a2ef075264942829e580af613b8ac0d3f8831d2 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Tue, 18 Sep 2018 20:57:18 +0200
Subject: [PATCH 32/77] update docs and reintroduce monero7

- reintroduce monero7 until the POW is final
- update docs (add cryptonight_v8)
---
 README.md                      | 1 +
 xmrstak/backend/amd/config.tpl | 1 +
 xmrstak/jconf.cpp              | 1 +
 xmrstak/pools.tpl              | 4 +++-
 4 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e3b01328a..2fe1bc511 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_masari
     - cryptonight_v7
     - cryptonight_v7_stellite
+    - cryptonight_v8
 - 4MiB scratchpad memory
     - cryptonight_haven
     - cryptonight_heavy
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 0101b7e2f..63106bcb9 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -9,6 +9,7 @@ R"===(
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
  *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ *                             (not allowed for cryptonight_v8 ans monero8)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 609b55f72..c69d47ab8 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -105,6 +105,7 @@ xmrstak::coin_selection coins[] = {
 	{ "haven",               {cryptonight_haven, cryptonight_heavy, 3u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
+	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
 	{ "monero8",             {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	 {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 78f2315ac..9c3dd5a59 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -27,7 +27,8 @@ POOLCONF],
  *    haven (automatic switch with block version 3 to cryptonight_haven)
  *    intense
  *    masari
- *    monero7 (use this for Monero's new PoW)
+ *    monero7
+ *    monero8 (use this to support Monero's Oct 2018 fork)
  *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin
@@ -41,6 +42,7 @@ POOLCONF],
  *    # 2MiB scratchpad memory
  *    cryptonight
  *    cryptonight_v7
+ *    cryptonight_v8
  *    # 4MiB scratchpad memory
  *    cyrptonight_bittube2
  *    cryptonight_haven

From 78bd54ff2f63a65f5c01848160f08324d04ea2d3 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Tue, 18 Sep 2018 21:09:05 +0200
Subject: [PATCH 33/77] fix naming `cryptonigh` to `cryptonight`

---
 CMakeLists.txt                                |  8 +++---
 .../cpu/crypto/asm/cryptonigh_v8_main_loop.S  | 27 -------------------
 .../crypto/asm/cryptonigh_v8_main_loop.asm    | 18 -------------
 .../cpu/crypto/asm/cryptonight_v8_main_loop.S | 27 +++++++++++++++++++
 .../crypto/asm/cryptonight_v8_main_loop.asm   | 18 +++++++++++++
 ...ptonight_v8_main_loop_ivybridge_linux.inc} |  0
 ...ptonight_v8_main_loop_ivybridge_win64.inc} |  0
 ... cryptonight_v8_main_loop_ryzen_linux.inc} |  0
 ... cryptonight_v8_main_loop_ryzen_win64.inc} |  0
 .../backend/cpu/crypto/cryptonight_aesni.h    |  8 +++---
 10 files changed, 53 insertions(+), 53 deletions(-)
 delete mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
 delete mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge_linux.inc => cryptonight_v8_main_loop_ivybridge_linux.inc} (100%)
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ivybridge_win64.inc => cryptonight_v8_main_loop_ivybridge_win64.inc} (100%)
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen_linux.inc => cryptonight_v8_main_loop_ryzen_linux.inc} (100%)
 rename xmrstak/backend/cpu/crypto/asm/{cryptonigh_v8_main_loop_ryzen_win64.inc => cryptonight_v8_main_loop_ryzen_win64.inc} (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b51eb2ae4..eec03df9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,18 +448,18 @@ endif()
 if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
     # asm optimized monero v8 code
     enable_language(ASM_MASM)
-    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm" PROPERTY ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
     add_library(xmr-stak-asm
         STATIC
-        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm"
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
     )
 else()
     # asm optimized monero v8 code
     enable_language(ASM)
-    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" PROPERTY C)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY C)
     add_library(xmr-stak-asm
         STATIC
-        "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
     )
 endif()
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
deleted file mode 100644
index 736dac7de..000000000
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S
+++ /dev/null
@@ -1,27 +0,0 @@
-#define ALIGN .align
-.intel_syntax noprefix
-#ifdef __APPLE__
-#   define FN_PREFIX(fn) _ ## fn
-.text
-#else
-#   define FN_PREFIX(fn) fn
-.section .text
-#endif
-.global FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm)
-.global FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm)
-
-ALIGN 8
-FN_PREFIX(cryptonigh_v8_mainloop_ivybridge_asm):
-	sub rsp, 48
-	mov rcx, rdi
-        #include "cryptonigh_v8_main_loop_ivybridge_linux.inc"
-	add rsp, 48
-	ret 0
-
-ALIGN 8
-FN_PREFIX(cryptonigh_v8_mainloop_ryzen_asm):
-	sub rsp, 48
-	mov rcx, rdi
-        #include "cryptonigh_v8_main_loop_ryzen_linux.inc"
-	add rsp, 48
-	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
deleted file mode 100644
index 7f2d6a584..000000000
--- a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.asm
+++ /dev/null
@@ -1,18 +0,0 @@
-_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
-PUBLIC cryptonigh_v8_mainloop_ivybridge_asm
-PUBLIC cryptonigh_v8_mainloop_ryzen_asm
-
-ALIGN 8
-cryptonigh_v8_mainloop_ivybridge_asm PROC
-        INCLUDE cryptonigh_v8_main_loop_ivybridge_win64.inc
-	ret 0
-cryptonigh_v8_mainloop_ivybridge_asm ENDP
-
-ALIGN 8
-cryptonigh_v8_mainloop_ryzen_asm PROC
-        INCLUDE cryptonigh_v8_main_loop_ryzen_win64.inc
-	ret 0
-cryptonigh_v8_mainloop_ryzen_asm ENDP
-
-_TEXT_CNV8_MAINLOOP ENDS
-END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
new file mode 100644
index 000000000..3aa8994dd
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -0,0 +1,27 @@
+#define ALIGN .align
+.intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
+.section .text
+#endif
+.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+        #include "cryptonight_v8_main_loop_ivybridge_linux.inc"
+	add rsp, 48
+	ret 0
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
+	sub rsp, 48
+	mov rcx, rdi
+        #include "cryptonight_v8_main_loop_ryzen_linux.inc"
+	add rsp, 48
+	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
new file mode 100644
index 000000000..3c2bba619
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -0,0 +1,18 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cryptonight_v8_mainloop_ivybridge_asm
+PUBLIC cryptonight_v8_mainloop_ryzen_asm
+
+ALIGN 8
+cryptonight_v8_mainloop_ivybridge_asm PROC
+        INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
+	ret 0
+cryptonight_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 8
+cryptonight_v8_mainloop_ryzen_asm PROC
+        INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
+	ret 0
+cryptonight_v8_mainloop_ryzen_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
similarity index 100%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_linux.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
similarity index 100%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ivybridge_win64.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
similarity index 100%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_linux.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
similarity index 100%
rename from xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop_ryzen_win64.inc
rename to xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 0838cfac4..844e4c045 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -923,8 +923,8 @@ struct Cryptonight_hash<5>
 	}
 };
 
-extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
-extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
 
 template<xmrstak_algo ALGO, int asm_version>
 void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
@@ -935,9 +935,9 @@ void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, crypto
 	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
 
 	if (asm_version == 1)
-		cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]);
+		cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
 	else
-		cryptonigh_v8_mainloop_ryzen_asm(ctx[0]);
+		cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
 
 	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
 	keccakf((uint64_t*)ctx[0]->hash_state, 24);

From 1692c543c6be416f5b6b14e1501c880e62ee5fe6 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 18:05:47 +0200
Subject: [PATCH 34/77] asm, style and spelling fixes

- fix code style issues
- fix spelling issue
- fix asm to support newer clang versions
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl           | 4 ++--
 xmrstak/backend/amd/config.tpl                              | 4 ++--
 xmrstak/backend/amd/jconf.cpp                               | 4 ++--
 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S   | 4 ++--
 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm | 4 ++--
 .../crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc | 5 +++--
 .../cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc | 6 +++---
 xmrstak/backend/cpu/crypto/cryptonight_aesni.h              | 6 +++---
 xmrstak/backend/cpu/minethd.cpp                             | 2 +-
 9 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 7d0ad1818..286bc39b6 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -718,7 +718,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #elif(ALGO==11)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
-			// flush shuffeled data
+			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
  			idx0 = c[0] & MASK;
  			idxS = idx0 & 0x30;
@@ -786,7 +786,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 // cryptonight_monero_v8
 #if (ALGO == 11)
 #	if defined(__NV_CL_C_VERSION)
-			// flush shuffeled data
+			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
 #	endif
 			b_x[1] = b_x[0];
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 63106bcb9..043b05355 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -9,12 +9,12 @@ R"===(
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
  *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- *                             (not allowed for cryptonight_v8 ans monero8)
+ *                             (not allowed for cryptonight_v8 and monero8)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
- * unroll        - allow to control how often the POW main loop is unrolled; valid range [0;128]
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [0;128) - for most OpenCL implementations it must be a power of two.
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
  *                 to use a intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index cd2486973..777dbdbb5 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -151,9 +151,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 
 	cfg.memChunk = (int)memChunk->GetInt64();
 	
-	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 )
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (unroll->GetInt64() & (unroll->GetInt64() - 1)) != 0) // x & (x - 1) is non-zero unless x is 0 or a power of two, so 0 stays accepted
 	{
-		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128");
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two");
 		return false;
 	}
 	cfg.unroll = (int)unroll->GetInt64();
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
index 3aa8994dd..b6be9438f 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -14,7 +14,7 @@ ALIGN 8
 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
 	sub rsp, 48
 	mov rcx, rdi
-        #include "cryptonight_v8_main_loop_ivybridge_linux.inc"
+	#include "cryptonight_v8_main_loop_ivybridge_linux.inc"
 	add rsp, 48
 	ret 0
 
@@ -22,6 +22,6 @@ ALIGN 8
 FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
 	sub rsp, 48
 	mov rcx, rdi
-        #include "cryptonight_v8_main_loop_ryzen_linux.inc"
+	#include "cryptonight_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
index 3c2bba619..a1615e9bd 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -4,13 +4,13 @@ PUBLIC cryptonight_v8_mainloop_ryzen_asm
 
 ALIGN 8
 cryptonight_v8_mainloop_ivybridge_asm PROC
-        INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
+	INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
 	ret 0
 cryptonight_v8_mainloop_ivybridge_asm ENDP
 
 ALIGN 8
 cryptonight_v8_mainloop_ryzen_asm PROC
-        INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
+	INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
 	ret 0
 cryptonight_v8_mainloop_ryzen_asm ENDP
 
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
index 23f6cc060..21f1f48c3 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -157,14 +157,15 @@ sqrt_fixup_ivybridge_ret:
 
 sqrt_fixup_ivybridge:
 	dec	 rdx
-	movq r13, -4389456576512
+	mov	r13d, -1022
+ 	shl	r13, 32
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	movq r13, 4389456576511
+	not	r13
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
index 551ee8573..9c177b85a 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -157,14 +157,14 @@ sqrt_fixup_ryzen_ret:
 sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	movq rdx, 4389456576511
+	mov	edx, -1022
+ 	shl	rdx, 32
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
-	sub	rcx, rdx
-	movq rdx, -4389456576512
+	lea	rcx, [rcx+rdx+1]
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 844e4c045..6edae905e 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -441,8 +441,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
 	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
 #else
 	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
- 	// Fallback to simpler code
- 	if (x2 < n0) ++r;
+	// Fallback to simpler code
+	if (x2 < n0) ++r;
 #endif
 	return r;
 }
@@ -733,7 +733,7 @@ inline void set_float_rounding_mode()
 /** add append n to all arguments and keeps n as first argument
  *
  * @param n number which is appended to the arguments (expect the first argument n)
- * 
+ *
  * @code{.cpp}
  * CN_ENUM_2(1, foo, bar)
  * // is transformed to
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2f01d5e90..05743ae92 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -453,7 +453,7 @@ template<size_t N>
 minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
-	
+
 	// check for asm optimized version for cryptonight_v8
 	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
 	{

From ac56ecbde8d19c5bc6ab8b76ab9dfebab1b7eb85 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Wed, 19 Sep 2018 20:21:24 +0200
Subject: [PATCH 35/77] cuda fast math for Monero pow v8

Add fast version for div and sqrt for the cuda backend
---
 .../nvcc_code/cuda_fast_int_math_v2.hpp       | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
new file mode 100644
index 000000000..41ec70e1c
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <stdint.h>
+
+static __constant__ const uint32_t RCP_C[256] =
+{
+	0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
+	0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
+	0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
+	0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
+	0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
+	0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
+	0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
+	0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
+	0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
+	0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
+	0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
+	0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
+	0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
+	0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
+	0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
+	0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
+	0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
+	0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
+	0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
+	0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
+	0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
+	0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
+	0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
+	0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
+	0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
+	0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
+	0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
+	0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
+	0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
+	0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
+	0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
+	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
+};
+
+__device__ __forceinline__ uint32_t get_reciprocal(const uint32_t* RCP, uint32_t a)
+{
+	const uint32_t index1 = (a & 0x7F000000U) >> 23;
+	const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768;
+
+	const uint32_t r1 = RCP[index1];
+	uint32_t r2_0 = RCP[index1 + 1];
+	if (index2 > 0) r2_0 >>= 16;
+	const int r2 = r2_0 & 0xFFFFU;
+
+	const uint32_t r = r1 - (uint32_t)(__mul24(r2, index2) >> 6);
+
+	const uint64_t lo0 = (uint64_t)(r) * a;
+	uint64_t lo = lo0 + ((uint64_t)(a) << 32);
+
+	a >>= 1;
+	const bool b = (a >= lo) || (lo >= lo0);
+	lo = a - lo;
+
+	const uint64_t k = __umulhi((uint32_t)lo, r) + ((uint64_t)(r) * ((uint32_t*)&lo)[1]) + lo;
+	return ((uint32_t*)&k)[1] + (b ? r : 0);
+}
+
+__device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, uint32_t b)
+{
+	const uint32_t r = get_reciprocal(RCP, b);
+	const uint64_t k = __umulhi((uint32_t)a, r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+
+	uint32_t q[2];
+	q[0] = ((uint32_t*)&k)[1];
+	q[1] = (k < a) ? 1 : 0;
+
+	const int64_t tmp = a - *((uint64_t*)(q)) * b;
+	const bool overshoot = (tmp < 0);
+	const bool undershoot = (tmp >= b);
+
+	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
+	q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U);
+
+	return *((uint64_t*)(q));
+}
+
+__device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
+{
+	float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23));
+	float x1;
+	asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x));
+	asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x));
+
+	// The following line does x1 *= 4294967296.0f;
+	x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23));
+
+	const uint32_t x0 = __float_as_uint(x) - (158U << 23);
+	const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18);
+	const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1;
+
+	uint32_t result = (x0 << 10) + __float2int_rn(delta);
+	const uint32_t s = result >> 1;
+	const uint32_t b = result & 1;
+
+	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
+	if ((int64_t)(x2 + b) > 0) --result;
+	if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result;
+
+	return result;
+}

From 659918f26bf07a49059417735f02626545ca1f36 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 20:50:32 +0200
Subject: [PATCH 36/77] NVIDIA: optimize div and sqrt

- use optimized div and sqrt
- reduce memory footprint
---
 xmrstak/backend/amd/jconf.cpp                 |  2 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 62 ++++++-------------
 2 files changed, 19 insertions(+), 45 deletions(-)

diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 777dbdbb5..fb1a04b4c 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -151,7 +151,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 
 	cfg.memChunk = (int)memChunk->GetInt64();
 	
-	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 ||  )
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128)
 	{
 		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two");
 		return false;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 1273f89e9..4e34e75a9 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -7,6 +7,8 @@
 #include <cuda_runtime.h>
 
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+
 
 #ifdef _WIN32
 #include <windows.h>
@@ -203,22 +205,6 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3
 	return tmp;
 }
 
-__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0)
-{
-	uint64_t x = (n0 >> 12) + (1023ULL << 52);
-	const double xx = sqrt( *reinterpret_cast<double*>(&x) );
-	uint64_t r = *reinterpret_cast<const uint64_t*>(&xx);
-
-	const uint64_t s = r >> 20;
-	r >>= 19;
-
-	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
-
- 	if (x2 < n0) ++r;
-
-	return r;
-}
-
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
@@ -229,6 +215,12 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init( sharedMemory );
+	__shared__ uint32_t RCP[256];
+	for (int i = threadIdx.x; i < 256; i += blockDim.x)
+	{
+		RCP[i] = RCP_C[i];
+	}
+
 
 	__syncthreads( );
 
@@ -284,7 +276,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 		// must be valid only for `sub < 2`
 		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
-		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
 	}
 	else
 		d[1] = (d_ctx_b + thread * 4)[sub];
@@ -421,39 +413,23 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 			if(ALGO == cryptonight_monero_v8 )
 			{
-				const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1);
-
 				// Use division and square root results from the _previous_ iteration to hide the latency
 				const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
 
-
-				const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
-				const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32);
-
+				uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
+				((uint32_t*)&division_result_64)[1] ^= sqrt_result;
+		
 				if(sub < 2)
-					*((uint64_t*)yy) ^= cl_rhs;
-
-
-				const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
+					*((uint64_t*)yy) ^= division_result_64;
 
-				// Most and least significant bits in the divisor are set to 1
-				// to make sure we don't divide by a small or even number,
-				// so there are no shortcuts for such cases
-				//
-				// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
-				// We drop the highest bit to fit both quotient and remainder in 32 bits
-
-				// Compiler will optimize it to a single div instruction
+				const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
 				const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-
-
-				const uint64_t division_result_tmp = static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
+				const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd);
 
 				division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
 								
 				// Use division_result as an input for the square root to prevent parallel implementation in hardware
-				const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp);
-				sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2];
+				sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp);
 			}
 
 			uint32_t zz[2];
@@ -706,7 +682,6 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
-
 	if(miner_algo == cryptonight_monero)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
@@ -745,11 +720,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
 	}
-
 }

From fd27561be68abaf435bd1296eb9d35f7e790e57c Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 19 Sep 2018 21:35:35 +0200
Subject: [PATCH 37/77] NVIDIA: optimize v8

- fix that shared memory for fast div is always used even if an algorithm is not using it
- optimize fast div algo
- store `division_result` (64-bit) per thread instead of shuffling it around and storing it as 32-bit
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 30 ++++++++++---------
 .../nvcc_code/cuda_fast_int_math_v2.hpp       |  8 ++---
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 4e34e75a9..563814702 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init( sharedMemory );
-	__shared__ uint32_t RCP[256];
-	for (int i = threadIdx.x; i < 256; i += blockDim.x)
+	uint32_t* RCP;
+	if(ALGO == cryptonight_monero_v8)
 	{
-		RCP[i] = RCP_C[i];
+		__shared__ uint32_t RCP_shared[256];
+		for (int i = threadIdx.x; i < 256; i += blockDim.x)
+		{
+			RCP_shared[i] = RCP_C[i];
+		}
+		RCP = RCP_shared;
 	}
 
 
@@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		}
 	}
 
-	uint32_t bx1, division_result, sqrt_result;
+	uint32_t bx1, sqrt_result;
+	uint64_t division_result;
 	if(ALGO == cryptonight_monero_v8)
 	{
 		d[1] = (d_ctx_b + thread * 12)[sub];
 		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
 
 		// must be valid only for `sub < 2`
-		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
 		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
 	}
 	else
@@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			{
 				// Use division and square root results from the _previous_ iteration to hide the latency
 				const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
-
-				uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
-				((uint32_t*)&division_result_64)[1] ^= sqrt_result;
+				((uint32_t*)&division_result)[1] ^= sqrt_result;
 		
 				if(sub < 2)
-					*((uint64_t*)yy) ^= division_result_64;
+					*((uint64_t*)yy) ^= division_result;
 
 				const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
 				const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-				const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd);
-
-				division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
-								
+				division_result = fast_div_v2(RCP, cx1, dd);
+			
 				// Use division_result as an input for the square root to prevent parallel implementation in hardware
-				sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp);
+				sqrt_result = fast_sqrt_v2(cx0 + division_result);
 			}
 
 			uint32_t zz[2];
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 41ec70e1c..2a25a9c07 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a,
 	q[1] = (k < a) ? 1 : 0;
 
 	const int64_t tmp = a - *((uint64_t*)(q)) * b;
-	const bool overshoot = (tmp < 0);
-	const bool undershoot = (tmp >= b);
+	const uint32_t overshoot = (tmp < 0) ? 1u : 0U;
+	const uint32_t undershoot = (tmp >= b) ? 1u : 0U;
 
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-	q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U);
+	q[0] += undershoot - overshoot;
+	q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? b : 0U);
 
 	return *((uint64_t*)(q));
 }

From 2818a4481eb23d0971974879b09707d07724942a Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 21 Sep 2018 20:53:31 +0200
Subject: [PATCH 38/77] NVIDIA: sqrt optimization cryptonight_v8

Avoid branch divergence
---
 xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 2a25a9c07..e3220230a 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -99,8 +99,9 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 	const uint32_t b = result & 1;
 
 	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
-	if ((int64_t)(x2 + b) > 0) --result;
-	if ((int64_t)(x2 + 0x100000000UL + s) < 0) ++result;
+	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0;
+	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0;
+	result += (overshoot+undershoot);
 
 	return result;
 }

From fce822e5f094d8bde9d0c3f3745d91129506ded0 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 21 Sep 2018 20:55:56 +0200
Subject: [PATCH 39/77] AMD: remove unused functions

- remove unused host function (relic of an old refactoring)
- remove unused OpenCL full div function
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp           | 21 --------------
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    | 28 -------------------
 2 files changed, 49 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 767e53855..e2c2dfeb8 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -611,27 +611,6 @@ const char* const attributeNames[] = {
 
 #define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))
 
-void PrintDeviceInfo(cl_device_id device)
-{
-	char queryBuffer[1024];
-	int queryInt;
-	cl_int clError;
-	clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_NAME: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VENDOR: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DRIVER_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
-	printf("    CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
-}
-
 uint32_t getNumPlatforms()
 {
 	cl_uint num_platforms = 0;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index fe7cea1ee..607806b7a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -81,34 +81,6 @@ inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
 	);
 }
 
-inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r)
-{
-	const uint rcp = get_reciprocal((const __local uchar *)RCP, b);
-	const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a;
-
-	((uint*)q)[0] = as_uint2(k).s1;
-	((uint*)q)[1] = (k < a) ? 1 : 0;
-
-	long tmp = a - (*q) * b;
-
-	const bool overshoot = (tmp < 0);
-	const bool undershoot = (tmp >= b);
-
-	if (overshoot)
-	{
-		--(*q);
-		tmp += b;
-	}
-
-	if (undershoot)
-	{
-		++(*q);
-		tmp -= b;
-	}
-
-	*r = tmp;
-}
-
 inline uint fast_sqrt_v2(const ulong n1)
 {
 	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));

From e94296672a56afd2497bebc8fc1de9b2557bc7d5 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Sat, 22 Sep 2018 06:59:28 -0600
Subject: [PATCH 40/77] configEditor: add version tagging, line filtering
 per-platform

---
 xmrstak/backend/amd/config.tpl    |  4 +-
 xmrstak/backend/cpu/config.tpl    |  4 +-
 xmrstak/backend/nvidia/config.tpl |  4 +-
 xmrstak/config.tpl                | 73 +++++++++++++++++--------------
 xmrstak/misc/configEditor.hpp     | 19 ++++++++
 xmrstak/pools.tpl                 |  5 +--
 6 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 043b05355..18ef8c696 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
  * index         - GPU index number usually starts from 0
@@ -37,5 +38,4 @@ GPUCONFIG
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
 "platform_index" : PLATFORMINDEX,
-
 )==="
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index e4da15fad..37158d6e2 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * Thread configuration for each thread. Make sure it matches the number above.
  * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true,
@@ -38,5 +39,4 @@ R"===(
 [
 CPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 2aa68dc46..144da80b9 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with threads and blocks as the fastest settings will vary.
  * index         - GPU index number usually starts from 0.
@@ -35,5 +36,4 @@ R"===(
 [
 GPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index 14330a829..deb52aa09 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * Network timeouts.
  * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make
@@ -58,43 +59,53 @@ R"===(
  * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration,
  * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is
  * meant as a backup, you won't get stellar results there. If you are running into trouble, especially
- * on Windows, please read the common issues in the README.
- *
- * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.
- * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN
- *
- * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.
- * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.
- * 3. Expand Security Settings, and then expand Local Policies.
- * 4. Select the User Rights Assignment folder.
- * 5. The policies will be displayed in the details pane.
- * 6. In the pane, double-click Lock pages in memory.
- * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.
- * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on
- * 9. Reboot for change to take effect.
- *
- * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need
- * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.
- *
- * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your
- * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144"
- * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT
- * recommended for security reasons.
- *
- * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a
- * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between
- * locked and unlocked memory. If that is your setup see option "no_mlck".
+ * on Windows, please read the common issues in the README and FAQ.
+ *
+ * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.---WINDOWS
+ * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN---WINDOWS
+ *---WINDOWS
+ * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.---WINDOWS
+ * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.---WINDOWS
+ * 3. Expand Security Settings, and then expand Local Policies.---WINDOWS
+ * 4. Select the User Rights Assignment folder.---WINDOWS
+ * 5. The policies will be displayed in the details pane.---WINDOWS
+ * 6. In the pane, double-click Lock pages in memory.---WINDOWS
+ * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.---WINDOWS
+ * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on---WINDOWS
+ * 9. Reboot for change to take effect.---WINDOWS
+ *---WINDOWS
+ * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need---WINDOWS
+ * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.---WINDOWS
+ * On Linux you will need to configure large page support and increase your memlock limit (ulimit -l).---LINUX
+ *---LINUX
+ * To set large page support, add the following to "/etc/sysctl.d/60-hugepages.conf":---LINUX
+ *     vm.nr_hugepages=128---LINUX
+ * You WILL need to run "sudo sysctl --system" for these settings to take effect on your system (or reboot).---LINUX
+ *  In some cases (many threads, very large CPU, etc) you may need more than 128---LINUX
+ *   (try 256 if there are still complaints from thread inits)---LINUX
+ *---LINUX
+ * To increase the memlock (ulimit -l), add following lines to /etc/security/limits.d/60-memlock.conf:---LINUX
+ *     *    - memlock 262144---LINUX
+ *     root - memlock 262144---LINUX
+ * You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).---LINUX
+ *---LINUX
+ * Check with "/sbin/sysctl vm.nr_hugepages ; ulimit -l" to validate---LINUX
+ *---LINUX
+ * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX
+ * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX
+ * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX
  */
 
 /*
  * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here:
  * always  - Don't even try to use large pages. Always use slow memory.
  * warn    - We will try to use large pages, but fall back to slow memory if that fails.
- * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.
- *           It will never use slow memory, but it won't attempt to mlock
+ * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.---LINUX
+ *           It will never use slow memory, but it won't attempt to mlock---LINUX
  * never   - If we fail to allocate large pages we will print an error and exit.
  */
-"use_slow_memory" : "warn",
+"use_slow_memory" : "warn",---WINDOWS
+"use_slow_memory" : "no_mlck",---LINUX
 
 /*
  * TLS Settings
@@ -149,6 +160,4 @@ R"===(
  *               This setting will only be needed in 2020's. No need to worry about it now.
  */
 "prefer_ipv4" : true,
-
 )==="
-
diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp
index d95ea6b72..3f79df44c 100644
--- a/xmrstak/misc/configEditor.hpp
+++ b/xmrstak/misc/configEditor.hpp
@@ -6,6 +6,7 @@
 #include <streambuf>
 #include <regex>
 
+#include "../version.hpp"
 
 namespace xmrstak
 {
@@ -42,6 +43,24 @@ struct configEditor
 
 	void write(const std::string filename)
 	{
+		// endmarks: for filtering full lines inside the template string
+		// Platform marks are done globally here
+		// "---WINDOWS" endmark keeps lines when compiled for Windows
+		// "---LINUX"   endmark keeps lines when compiled for Linux (and anything not-windows)
+#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__WINDOWS__)
+		// windows:
+		//   completely drop lines with endmark-linux
+		replace(".*---LINUX\n", "");
+		//   strip off windows endmarks, keep the lines
+		replace("---WINDOWS\n", "\n");
+#else
+		// not-windows:
+		//   completely drop lines with endmark-windows
+		replace(".*---WINDOWS\n", "");
+		//   strip off linux endmarks, keep the lines
+		replace("---LINUX\n", "\n");
+#endif
+		replace("XMRSTAK_VERSION", get_version_str());
 		std::ofstream out(filename);
 		out << m_fileContent;
 		out.close();
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 9c3dd5a59..59c4ba9d6 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * pool_address    - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported.
  * wallet_address  - Your wallet, or pool login.
@@ -50,6 +51,4 @@ POOLCONF],
  */
 
 "currency" : "CURRENCY",
-
 )==="
-

From 1fbfb1547ce6794615d20b1525bdd8dec3995048 Mon Sep 17 00:00:00 2001
From: BBSCoin Developer <43017551+bbscoindev@users.noreply.github.com>
Date: Sat, 22 Sep 2018 23:41:25 -0700
Subject: [PATCH 41/77] Update BBSCoin config to prepare for the next fork

---
 xmrstak/jconf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index c69d47ab8..c0ef1a779 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -88,7 +88,7 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
 	{ "aeon7",               {cryptonight_aeon, cryptonight_lite, 7u},     {cryptonight_aeon, cryptonight_lite, 7u},     "mine.aeon-pool.com:5555" },
-	{ "bbscoin",             {cryptonight_monero, cryptonight, 3u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
+	{ "bbscoin",             {cryptonight_lite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "bittube",             {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
 	{ "cryptonight",         {cryptonight_monero, cryptonight, 255u},      {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},

From 957503b1d38583c5b80ab34fde25d71edff0ee48 Mon Sep 17 00:00:00 2001
From: BBSCoin Developer <43017551+bbscoindev@users.noreply.github.com>
Date: Sun, 23 Sep 2018 00:02:48 -0700
Subject: [PATCH 42/77] Change BBSCoin PoW to CN lite v7

---
 xmrstak/jconf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index c0ef1a779..355da8e6e 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -88,7 +88,7 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
 	{ "aeon7",               {cryptonight_aeon, cryptonight_lite, 7u},     {cryptonight_aeon, cryptonight_lite, 7u},     "mine.aeon-pool.com:5555" },
-	{ "bbscoin",             {cryptonight_lite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
+	{ "bbscoin",             {cryptonight_aeon, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "bittube",             {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
 	{ "cryptonight",         {cryptonight_monero, cryptonight, 255u},      {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},

From f03319c33429376d333510058579b7ead6241aec Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Sat, 22 Sep 2018 09:08:30 -0600
Subject: [PATCH 43/77] telemetry: Add mutex to avoid push during recalc and
 other races

---
 xmrstak/misc/telemetry.cpp | 2 ++
 xmrstak/misc/telemetry.hpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp
index 5642f6b66..197da8eca 100644
--- a/xmrstak/misc/telemetry.cpp
+++ b/xmrstak/misc/telemetry.cpp
@@ -49,6 +49,7 @@ telemetry::telemetry(size_t iThd)
 
 double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 {
+	std::unique_lock<std::mutex> lk(mtx);
 	uint64_t iTimeNow = get_timestamp_ms();
 
 	uint64_t iEarliestHashCnt = 0;
@@ -98,6 +99,7 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 
 void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp)
 {
+	std::unique_lock<std::mutex> lk(mtx);
 	size_t iTop = iBucketTop[iThd];
 	ppHashCounts[iThd][iTop] = iHashCount;
 	ppTimestamps[iThd][iTop] = iTimestamp;
diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp
index 309fd6d06..1813c00e6 100644
--- a/xmrstak/misc/telemetry.hpp
+++ b/xmrstak/misc/telemetry.hpp
@@ -2,6 +2,7 @@
 
 #include <cstdint>
 #include <cstring>
+#include <mutex>
 
 namespace xmrstak
 {
@@ -14,6 +15,7 @@ class telemetry
 	double calc_telemetry_data(size_t iLastMillisec, size_t iThread);
 
 private:
+	mutable std::mutex mtx;
 	constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations
 	constexpr static size_t iBucketMask = iBucketSize - 1;
 	uint32_t* iBucketTop;

From cac26b96d642f52071182b087f2001181d0d7a95 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 23 Sep 2018 21:02:29 +0200
Subject: [PATCH 44/77] add cryptonight_v8 tweak 2.2

add cpu implementation for the final monero POW
---
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 10 ++++--
 ...yptonight_v8_main_loop_ivybridge_linux.inc | 12 +++++--
 ...yptonight_v8_main_loop_ivybridge_win64.inc | 12 +++++--
 .../cryptonight_v8_main_loop_ryzen_linux.inc  | 10 ++++--
 .../cryptonight_v8_main_loop_ryzen_win64.inc  | 10 ++++--
 .../backend/cpu/crypto/cryptonight_aesni.h    | 31 ++++++++++++++-----
 xmrstak/backend/cpu/minethd.cpp               |  4 +--
 7 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 286bc39b6..e65f0ed05 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -748,19 +748,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
  			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
 #endif
+			ulong2 result_mul;
+			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
+			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
 // cryptonight_monero_v8
 #if(ALGO==11)
 			{
-				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
 				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				result_mul ^= chunk2;
 				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			}
 #endif
-			a[1] += c[0] * as_ulong2(tmp).s0;
-			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+			a[1] += result_mul.s1;
+			a[0] += result_mul.s0;
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
index 21f1f48c3..bc4a82f86 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret:
 	mov	 r9, r10
 	mov	 rax, rdi
 	mul	 rbp
+        movq xmm0, rax
+ 	movq xmm1, rdx
+ 	punpcklqdq xmm1, xmm0
 
 	xor	 r9, 16
 	mov	 rcx, r10
 	xor	 rcx, 32
 	xor	 r10, 48
-	add	 r8, rdx
-	add	 r11, rax
-	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+        pxor xmm2, xmm1
+ 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+        xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+        add	 r8, rdx
+ 	add	 r11, rax
 	mov	 QWORD PTR [r14], r8
 	xor	 r8, rdi
 	mov	 r10, r8
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
index ee7f31716..3687d999b 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -113,17 +113,21 @@ sqrt_fixup_ivybridge_ret:
 	mov	 r9, r10
 	mov	 rax, rdi
 	mul	 rbp
+        movq xmm0, rax
+ 	movq xmm1, rdx
+ 	punpcklqdq xmm1, xmm0
 
 	xor	 r9, 16
 	mov	 rcx, r10
 	xor	 rcx, 32
 	xor	 r10, 48
-	add	 r8, rdx
-	add	 r11, rax
-	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
+        pxor xmm2, xmm1
+ 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+        xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -131,6 +135,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+        add	 r8, rdx
+ 	add	 r11, rax
 	mov	 QWORD PTR [r14], r8
 	xor	 r8, rdi
 	mov	 r10, r8
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
index 9c177b85a..a375a661f 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -109,14 +109,20 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
+        movq xmm1, rax
+ 	movq xmm0, rdx
+ 	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	movdqa	xmm0, XMMWORD PTR [r10+rbx]
-	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
+ 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+ 	pxor xmm2, xmm0
+ 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
 	paddq	xmm0, xmm4
 	paddq	xmm2, xmm3
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
index f70dccef8..a55004e42 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -109,14 +109,20 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
+        movq xmm1, rax
+ 	movq xmm0, rdx
+ 	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	movdqa	xmm0, XMMWORD PTR [r10+rbx]
-	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	xor rdx, [rcx+rbx]
+ 	xor rax, [rcx+rbx+8]
+ 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+ 	pxor xmm2, xmm0
+ 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
 	paddq	xmm0, xmm4
 	paddq	xmm2, xmm3
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 6edae905e..c0f122fd6 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -543,7 +543,7 @@ inline void set_float_rounding_mode()
 #endif
 }
 
-#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
@@ -556,6 +556,21 @@ inline void set_float_rounding_mode()
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
 	}
 
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		hi ^= ((uint64_t*)&chunk2)[0]; \
+		lo ^= ((uint64_t*)&chunk2)[1]; \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
+
 #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
 	if(ALGO == cryptonight_monero_v8) \
 	{ \
@@ -637,7 +652,7 @@ inline void set_float_rounding_mode()
 		else \
 			cx = _mm_aesenc_si128(cx, ax0); \
 	} \
-	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1)
+	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1)
 
 #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -659,18 +674,18 @@ inline void set_float_rounding_mode()
 	cl = ((uint64_t*)ptr0)[0]; \
 	ch = ((uint64_t*)ptr0)[1]; \
 	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
-	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
-	if(ALGO == cryptonight_monero_v8) \
-	{ \
-		bx1 = bx0; \
-		bx0 = cx; \
-	} \
 	{ \
 		uint64_t hi; \
 		lo = _umul128(idx0, cl, &hi); \
+		CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
 		ah0 += lo; \
 		al0 += hi; \
 	} \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
 	((uint64_t*)ptr0)[0] = al0; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 05743ae92..a344a9ffe 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -310,11 +310,11 @@ bool minethd::self_test()
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
-			bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+			bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
-			bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+			bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 		}
 		else if(algo == cryptonight_aeon)
 		{

From 915c868a487141c9a05439c2facb0fa21b1b8c8b Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 24 Sep 2018 20:11:22 +0200
Subject: [PATCH 45/77] disable CUDA backend for cryptonight_v8

---
 xmrstak/backend/nvidia/minethd.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 486a990e3..dc9b5fccf 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -144,6 +144,13 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 {
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
 
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+	if(miner_algo == cryptonight_monero_v8)
+	{
+		std::cerr<<"ERROR: The CUDA backend is currently not supporting cryptonight_v8, please use `--openCLVendor NVIDIA` instead."<<std::endl;
+		return pvThreads;
+	}
+
 	if(!configEditor::file_exist(params::inst().configFileNVIDIA))
 	{
 		autoAdjust adjust;

From 5003079d1a78f3a22e74f8cf67b1eae271c4c87d Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 24 Sep 2018 20:21:09 +0200
Subject: [PATCH 46/77] optimize asm code cryptonight_v8

apply optimizations

Co-authored-by: SChernykh <sergey.v.chernykh@gmail.com>
---
 ...yptonight_v8_main_loop_ivybridge_linux.inc | 72 ++++++++++---------
 ...yptonight_v8_main_loop_ivybridge_win64.inc | 71 +++++++++---------
 .../cryptonight_v8_main_loop_ryzen_linux.inc  | 23 +++---
 .../cryptonight_v8_main_loop_ryzen_win64.inc  | 25 ++++---
 4 files changed, 99 insertions(+), 92 deletions(-)
 mode change 100644 => 100755 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
 mode change 100644 => 100755 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc

diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
index bc4a82f86..cbe43b0d3 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -48,10 +48,10 @@
 	punpcklqdq xmm4, xmm0
 	movq	 xmm0, rcx
 	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 
 	ALIGN 8
 main_loop_ivybridge:
-	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
 	mov	 eax, r10d
@@ -63,28 +63,30 @@ main_loop_ivybridge:
 	movq	 xmm7, r8
 	punpcklqdq xmm7, xmm0
 	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
 	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm1, xmm7
-	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
 	paddq	 xmm0, xmm5
 	paddq	 xmm2, xmm4
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
-	movq	 rcx, xmm3
 	movdqu	 XMMWORD PTR [rax+rbx], xmm2
-	mov	 rax, rcx
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3
+	mov	 rax, rcx
 	shl	 rax, 32
 	xor	 rdi, rax
-	movq	 rbp, xmm6
 	movdqa	 xmm0, xmm6
 	pxor	 xmm0, xmm4
-	mov	 r10, rbp
-	and	 r10d, 2097136
 	movdqu	 XMMWORD PTR [rdx], xmm0
-	xor	 rdi, QWORD PTR [r10+rbx]
-	lea	 r14, QWORD PTR [r10+rbx]
-	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
 	xor	 edx, edx
 	lea	 r9d, DWORD PTR [ecx+ecx]
 	add	 r9d, ebp
@@ -93,6 +95,7 @@ main_loop_ivybridge:
 	or	 r9d, r13d
 	movq	 rax, xmm0
 	div	 r9
+	xorps xmm3, xmm3
 	mov	 eax, eax
 	shl	 rdx, 32
 	add	 rdx, rax
@@ -103,31 +106,37 @@ main_loop_ivybridge:
 	movq	 xmm0, rax
 	paddq	 xmm0, xmm8
 	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
 	movq	 rdx, xmm3
-	test	 rdx, 524287
+	test	 edx, 524287
 	je	 sqrt_fixup_ivybridge
 	psrlq	 xmm3, 19
 	psubq	 xmm3, XMMWORD PTR [rsp+16]
 sqrt_fixup_ivybridge_ret:
 
-	mov	 r9, r10
+	mov	 ecx, r10d
 	mov	 rax, rdi
 	mul	 rbp
-        movq xmm0, rax
- 	movq xmm1, rdx
- 	punpcklqdq xmm1, xmm0
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
 
-	xor	 r9, 16
-	mov	 rcx, r10
-	xor	 rcx, 32
-	xor	 r10, 48
-	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
-        pxor xmm2, xmm1
- 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
-        xor rdx, [rcx+rbx]
- 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -135,13 +144,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
-        add	 r8, rdx
- 	add	 r11, rax
-	mov	 QWORD PTR [r14], r8
-	xor	 r8, rdi
-	mov	 r10, r8
-	mov	 QWORD PTR [r14+8], r11
-	and	 r10d, 2097136
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
 	xor	 r11, r12
 	dec rsi
 	jne	 main_loop_ivybridge
@@ -163,15 +167,15 @@ sqrt_fixup_ivybridge_ret:
 
 sqrt_fixup_ivybridge:
 	dec	 rdx
-	mov	r13d, -1022
- 	shl	r13, 32
+	mov r13d, -1022
+	shl r13, 32
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	not	r13
+	not r13
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
old mode 100644
new mode 100755
index 3687d999b..8d49c5db7
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -48,10 +48,10 @@
 	punpcklqdq xmm4, xmm0
 	movd	 xmm0, rcx
 	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 
 	ALIGN 8
 main_loop_ivybridge:
-	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
 	lea	 rdx, QWORD PTR [r10+rbx]
 	mov	 ecx, r10d
 	mov	 eax, r10d
@@ -63,28 +63,30 @@ main_loop_ivybridge:
 	movd	 xmm7, r8
 	punpcklqdq xmm7, xmm0
 	aesenc	 xmm6, xmm7
+	movd	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
 	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm1, xmm7
-	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
 	paddq	 xmm0, xmm5
 	paddq	 xmm2, xmm4
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
-	movd	 rcx, xmm3
 	movdqu	 XMMWORD PTR [rax+rbx], xmm2
-	mov	 rax, rcx
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movd	 rcx, xmm3
+	mov	 rax, rcx
 	shl	 rax, 32
 	xor	 rdi, rax
-	movd	 rbp, xmm6
 	movdqa	 xmm0, xmm6
 	pxor	 xmm0, xmm4
-	mov	 r10, rbp
-	and	 r10d, 2097136
 	movdqu	 XMMWORD PTR [rdx], xmm0
-	xor	 rdi, QWORD PTR [r10+rbx]
-	lea	 r14, QWORD PTR [r10+rbx]
-	mov	 r12, QWORD PTR [r10+rbx+8]
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
 	xor	 edx, edx
 	lea	 r9d, DWORD PTR [ecx+ecx]
 	add	 r9d, ebp
@@ -93,6 +95,7 @@ main_loop_ivybridge:
 	or	 r9d, r13d
 	movd	 rax, xmm0
 	div	 r9
+	xorps xmm3, xmm3
 	mov	 eax, eax
 	shl	 rdx, 32
 	add	 rdx, rax
@@ -103,31 +106,37 @@ main_loop_ivybridge:
 	movd	 xmm0, rax
 	paddq	 xmm0, xmm8
 	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
 	movd	 rdx, xmm3
-	test	 rdx, 524287
+	test	 edx, 524287
 	je	 sqrt_fixup_ivybridge
 	psrlq	 xmm3, 19
 	psubq	 xmm3, XMMWORD PTR [rsp+16]
 sqrt_fixup_ivybridge_ret:
 
-	mov	 r9, r10
+	mov	 ecx, r10d
 	mov	 rax, rdi
 	mul	 rbp
-        movq xmm0, rax
- 	movq xmm1, rdx
- 	punpcklqdq xmm1, xmm0
+	movd xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movd xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
 
-	xor	 r9, 16
-	mov	 rcx, r10
-	xor	 rcx, 32
-	xor	 r10, 48
-	movdqu	 xmm2, XMMWORD PTR [r9+rbx]
-        pxor xmm2, xmm1
- 	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
 	paddq	 xmm0, xmm5
 	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
-        xor rdx, [rcx+rbx]
- 	xor rax, [rcx+rbx+8]
 	paddq	 xmm2, xmm4
 	paddq	 xmm1, xmm7
 	movdqa	 xmm5, xmm4
@@ -135,13 +144,8 @@ sqrt_fixup_ivybridge_ret:
 	movdqa	 xmm4, xmm6
 	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
 	movdqu	 XMMWORD PTR [r10+rbx], xmm1
-        add	 r8, rdx
- 	add	 r11, rax
-	mov	 QWORD PTR [r14], r8
-	xor	 r8, rdi
-	mov	 r10, r8
-	mov	 QWORD PTR [r14+8], r11
-	and	 r10d, 2097136
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
 	xor	 r11, r12
 	dec rsi
 	jne	 main_loop_ivybridge
@@ -163,14 +167,15 @@ sqrt_fixup_ivybridge_ret:
 
 sqrt_fixup_ivybridge:
 	dec	 rdx
-	mov  r13, -4389456576512
+	mov r13d, -1022
+	shl r13, 32
 	mov	 rax, rdx
 	shr	 rdx, 19
 	shr	 rax, 20
 	mov	 rcx, rdx
 	sub	 rcx, rax
 	add	 rax, r13
-	mov  r13, 4389456576511
+	not r13
 	sub	 rcx, r13
 	mov	 r13d, -2147483647
 	imul	 rcx, rax
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
index a375a661f..cd8b43477 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -109,25 +109,24 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
-        movq xmm1, rax
- 	movq xmm0, rdx
- 	punpcklqdq xmm0, xmm1
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	xor rdx, [rcx+rbx]
- 	xor rax, [rcx+rbx+8]
- 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
- 	pxor xmm2, xmm0
- 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
-	paddq	xmm0, xmm4
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
 	paddq	xmm2, xmm3
 	paddq	xmm1, xmm6
-	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
 	movdqa	XMMWORD PTR [rcx+rbx], xmm2
 	movdqa	XMMWORD PTR [r10+rbx], xmm1
 
@@ -163,8 +162,8 @@ sqrt_fixup_ryzen_ret:
 sqrt_fixup_ryzen:
 	movq r9, xmm2
 	dec	rdi
-	mov	edx, -1022
- 	shl	rdx, 32
+	mov edx, -1022
+	shl rdx, 32
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
old mode 100644
new mode 100755
index a55004e42..d103cc2ee
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -109,25 +109,24 @@ main_loop_ryzen:
 sqrt_fixup_ryzen_ret:
 	mov	rax, rsi
 	mul	r14
-        movq xmm1, rax
- 	movq xmm0, rdx
- 	punpcklqdq xmm0, xmm1
+	movd xmm1, rax
+	movd xmm0, rdx
+	punpcklqdq xmm0, xmm1
 
 	mov	r9d, r10d
 	mov	ecx, r10d
 	xor	r9d, 16
 	xor	ecx, 32
 	xor	r10d, 48
-	xor rdx, [rcx+rbx]
- 	xor rax, [rcx+rbx+8]
- 	movdqa	xmm2, XMMWORD PTR [r9+rbx]
- 	pxor xmm2, xmm0
- 	movdqa	xmm0, XMMWORD PTR [r10+rbx]
 	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
-	paddq	xmm0, xmm4
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
 	paddq	xmm2, xmm3
 	paddq	xmm1, xmm6
-	movdqa	XMMWORD PTR [r9+rbx], xmm0
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
 	movdqa	XMMWORD PTR [rcx+rbx], xmm2
 	movdqa	XMMWORD PTR [r10+rbx], xmm1
 
@@ -163,14 +162,14 @@ sqrt_fixup_ryzen_ret:
 sqrt_fixup_ryzen:
 	movd r9, xmm2
 	dec	rdi
-	mov rdx, 4389456576511
+	mov edx, -1022
+	shl rdx, 32
 	mov	rax, rdi
 	shr	rdi, 19
 	shr	rax, 20
 	mov	rcx, rdi
 	sub	rcx, rax
-	sub	rcx, rdx
-	mov rdx, -4389456576512
+	lea	rcx, [rcx+rdx+1]
 	add	rax, rdx
 	imul	rcx, rax
 	sub	rcx, r9

From 5db405c27842b35fcdd3488db344d10095c51013 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 29 Sep 2018 23:31:20 +0200
Subject: [PATCH 47/77] cuda: implement cryptonight_v8

- introduce a new schema where two threads work together on one hash
- update autoadjustment
- remove a mistake where shared memory was shrunk for GPUs < sm_70
---
 xmrstak/backend/nvidia/minethd.cpp            |   5 -
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 465 ++++++++++++------
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  27 +-
 3 files changed, 322 insertions(+), 175 deletions(-)

diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index dc9b5fccf..423cd201a 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -145,11 +145,6 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
 
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	if(miner_algo == cryptonight_monero_v8)
-	{
-		std::cerr<<"ERROR: The CUDA backend is currently not supporting cryptonight_v8, please use `--openCLVendor NVIDIA` instead."<<std::endl;
-		return pvThreads;
-	}
 
 	if(!configEditor::file_exist(params::inst().configFileNVIDIA))
 	{
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 563814702..a6501a9fb 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -69,9 +69,9 @@ typedef uint64_t IndexType;
 typedef int IndexType;
 #endif
 
-__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi )
+__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi )
 {
-	*product_hi = __umul64hi( multiplier, multiplicand );
+	product_hi = __umul64hi( multiplier, multiplicand );
 	return (multiplier * multiplicand );
 }
 
@@ -205,16 +205,67 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3
 	return tmp;
 }
 
+struct u64 : public uint2
+{
+
+	__forceinline__ __device__ u64(){}
+
+	__forceinline__ __device__ u64( const uint x0, const uint x1)
+	{
+		uint2::x = x0;
+		uint2::y = x1;
+	}
+
+	__forceinline__ __device__ operator uint64_t() const
+	{
+		return *((uint64_t*)this);
+	}
+
+	__forceinline__ __device__ u64( const uint64_t x0)
+	{
+		((uint64_t*)&this->x)[0] = x0;
+	}
+
+	__forceinline__ __device__ u64 operator^=(const u64& other)
+	{
+		uint2::x ^= other.x;
+		uint2::y ^= other.y;
+
+		return *this;
+	}
+
+	__forceinline__ __device__ u64 operator+(const u64& other) const
+	{
+		u64 tmp;
+		((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0];
+
+		return tmp;
+	}
+
+	__forceinline__ __device__ u64 operator+=(const uint64_t& other)
+	{
+		return ((uint64_t*)&this->x)[0] += other;
+	}
+
+	__forceinline__ __device__ void print(int i) const
+	{
+		if(i<2)
+			printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]);
+	}
+};
+
+
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
-__launch_bounds__( XMR_STAK_THREADS * 4 )
+__launch_bounds__( XMR_STAK_THREADS * 2 )
 #endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
 		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
 	__shared__ uint32_t sharedMemory[1024];
 
 	cn_aes_gpu_init( sharedMemory );
+
 	uint32_t* RCP;
 	if(ALGO == cryptonight_monero_v8)
 	{
@@ -226,6 +277,195 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		RCP = RCP_shared;
 	}
 
+#if( __CUDA_ARCH__ < 300 )
+	extern __shared__ u64 externShared[];
+	// 8 x 64bit values
+	u64* myChunks = (u64*)(externShared + (threadIdx.x >> 1) * 8);
+    volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
+#else
+	extern __shared__ u64 chunkMem[];
+    volatile uint32_t* sPtr = NULL;
+	// 8 x 64bit values
+	u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8);
+	
+#endif
+
+	__syncthreads( );
+
+	const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = tid >> 1;
+	const uint32_t sub = tid & 1;
+
+	if ( thread >= threads )
+		return;
+
+	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
+
+	u64 ax0 = ((u64*)(d_ctx_a + thread * 4))[sub];
+	u64 bx0;
+	uint32_t idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
+
+	u64* ptr0;
+
+	u64 bx1;
+	uint32_t sqrt_result;
+	uint64_t division_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		bx0 = ((u64*)(d_ctx_b + thread * 12))[sub];
+		bx1 = ((u64*)(d_ctx_b + thread * 12 + 4))[sub];
+
+		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
+	}
+	else
+		 bx0 = ((u64*)(d_ctx_b + thread * 4))[sub];
+
+	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+
+	for(int i = start; i < end; ++i)
+	{
+		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
+
+		#pragma unroll 4
+		for(int x = 0; x < 8; x += 2)
+		{
+			myChunks[x + sub] = ptr0[ x + sub ];
+		}
+
+		uint32_t idx1 = (idx0 & 0x30) >> 3;
+
+		const u64 cx = myChunks[ idx1 + sub ];
+		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
+
+		u64 cx_aes = ax0 ^ u64(
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+		);
+	
+		if(ALGO == cryptonight_monero_v8)
+		{
+			
+			const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ];
+			const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ];
+			const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+			__syncwarp();
+#else
+			__syncthreads( );
+#endif
+			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+		}
+
+		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
+		for(int x = 0; x < 8; x += 2)
+			ptr0[ x + sub ] = myChunks[x + sub];
+
+		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
+		idx1 = (idx0 & 0x30) >> 3;
+		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
+		#pragma unroll 4
+		for(int x = 0; x < 8; x += 2)
+		{
+			myChunks[x + sub] = ptr0[ x + sub ];
+		}
+
+		if(ALGO != cryptonight_monero_v8)
+			bx0 = cx_aes;
+		
+		uint64_t cx_mul;
+		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
+		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
+
+		if(ALGO == cryptonight_monero_v8 && sub == 1)
+		{
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			((uint32_t*)&division_result)[1] ^= sqrt_result;
+
+			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
+
+			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
+			division_result = fast_div_v2(RCP, cx_aes, dd);
+
+			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
+		}
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
+	
+		{
+			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
+			// sub 0 -> hi, sub 1 -> lo
+			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
+			if(ALGO == cryptonight_monero_v8)
+			{
+				const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
+				u64 chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				res ^= ((uint64_t*)&chunk2)[0];
+				const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			}
+			ax0 += res;
+		}
+		if(ALGO == cryptonight_monero_v8)
+		{
+			bx1 = bx0;
+			bx0 = cx_aes;
+		} 
+		myChunks[ idx1 + sub ] = ax0;
+		for(int x = 0; x < 8; x += 2)
+		{
+			ptr0[ x + sub ] = myChunks[x + sub];
+		}
+		ax0 ^= c;
+		idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
+	}
+
+	if ( bfactor > 0 )
+	{
+		((u64*)(d_ctx_a + thread * 4))[sub] = ax0;
+		if(ALGO == cryptonight_monero_v8)
+		{
+			((u64*)(d_ctx_b + thread * 12))[sub] = bx0;
+			((u64*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
+
+			if(sub == 1)
+			{
+				// must be valid only for `sub == 1`
+				((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result;
+			}
+		}
+		else
+			((u64*)(d_ctx_b + thread * 12))[sub] = bx0;
+	}
+}
+
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
+#ifdef XMR_STAK_THREADS
+__launch_bounds__( XMR_STAK_THREADS * 4 )
+#endif
+__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+		uint32_t startNonce, uint32_t * __restrict__ d_input )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
 
 	__syncthreads( );
 
@@ -272,20 +512,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			idx0 = *(d_ctx_b + threads * 4 + thread);
 		}
 	}
-
-	uint32_t bx1, sqrt_result;
-	uint64_t division_result;
-	if(ALGO == cryptonight_monero_v8)
-	{
-		d[1] = (d_ctx_b + thread * 12)[sub];
-		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
-
-		// must be valid only for `sub < 2`
-		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
-		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
-	}
-	else
-		d[1] = (d_ctx_b + thread * 4)[sub];
+	d[1] = (d_ctx_b + thread * 4)[sub];
 
 	#pragma unroll 2
 	for ( i = start; i < end; ++i )
@@ -294,7 +521,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		for ( int x = 0; x < 2; ++x )
 		{
 			j = ( ( idx0 & MASK ) >> 2 ) + sub;
-			
+
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
@@ -325,57 +552,6 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					}
 				}
 			}
-			else if(ALGO == cryptonight_monero_v8)
-			{
-
-				const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
-				uint4 chunk0{};
-				chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0);
-				chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0);
-				chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0);
-				chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0);
-
-				const uint32_t x_0 = ((uint32_t*)&chunk0)[sub];
-				const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4];
-				const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4];
-				const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4];
-				d[x] = a ^
-					t_fn0( x_0 & 0xff ) ^
-					t_fn1( (x_1 >> 8) & 0xff ) ^
-					t_fn2( (x_2 >> 16) & 0xff ) ^
-					t_fn3( ( x_3 >> 24 ) );
-
-				uint4 value;
-				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
-				if(sub == 1)
-					((uint64_t*)&value)[0] = tmp10;
-				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
-				if(sub == 1)
-					((uint64_t*)&value)[1] = tmp20;
-				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
-				if(sub == 2)
-					((uint64_t*)&value)[0] = tmp11;
-				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
-				if(sub == 2)
-					((uint64_t*)&value)[1] = tmp21;
-				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
-				if(sub == 3)
-					((uint64_t*)&value)[0] = tmp12;
-				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
-				if(sub == 3)
-					((uint64_t*)&value)[1] = tmp22;
-
-				if(sub > 0)
-				{
-					uint4 store{};
-					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
-					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
-
-					const int dest = sub + 1;
-					const int dest2 = dest == 4 ? 1 : dest;
-					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
-				}
-			}
 			else
 			{
 				const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
@@ -388,6 +564,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					t_fn2( (x_2 >> 16) & 0xff ) ^
 					t_fn3( ( x_3 >> 24 ) );
 			}
+
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
 			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -416,62 +593,10 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 			uint32_t yy[2];
 			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
-
-			if(ALGO == cryptonight_monero_v8 )
-			{
-				// Use division and square root results from the _previous_ iteration to hide the latency
-				const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
-				((uint32_t*)&division_result)[1] ^= sqrt_result;
-		
-				if(sub < 2)
-					*((uint64_t*)yy) ^= division_result;
-
-				const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
-				const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-				division_result = fast_div_v2(RCP, cx1, dd);
-			
-				// Use division_result as an input for the square root to prevent parallel implementation in hardware
-				sqrt_result = fast_sqrt_v2(cx0 + division_result);
-			}
-
 			uint32_t zz[2];
 			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
 			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
-			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
-			if(ALGO == cryptonight_monero_v8)
-			{
-				uint4 value;
-				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
-				if(sub == 1)
-					((uint64_t*)&value)[0] = tmp10;
-				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
-				if(sub == 1)
-					((uint64_t*)&value)[1] = tmp20;
-				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
-				if(sub == 2)
-					((uint64_t*)&value)[0] = tmp11;
-				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
-				if(sub == 2)
-					((uint64_t*)&value)[1] = tmp21;
-				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
-				if(sub == 3)
-					((uint64_t*)&value)[0] = tmp12;
-				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
-				if(sub == 3)
-					((uint64_t*)&value)[1] = tmp22;
-				if(sub > 0)
-				{
-					const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
-					uint4 store{};
-					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
-					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
-
-					const int dest = sub + 1;
-					const int dest2 = dest == 4 ? 1 : dest;
-					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
-				}
-			}
-			
+
 			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
 			#pragma unroll
 			for ( k = 0; k < 2; k++ )
@@ -521,31 +646,13 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 				idx0 = (~d) ^ q;
 			}
-			if(ALGO == cryptonight_monero_v8)
-			{
-				bx1 = d[(x + 1) % 2];
-			}
 		}
 	}
 
 	if ( bfactor > 0 )
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
-		if(ALGO == cryptonight_monero_v8)
-		{
-			(d_ctx_b + thread * 12)[sub] = d[1];
-			(d_ctx_b + thread * 12 + 4)[sub] = bx1;
-
-			if(sub < 2)
-			{
-				// must be valid only for `sub < 2`
-				(d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result;
-				(d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result;
-			}
-		}
-		else
-			(d_ctx_b + thread * 4)[sub] = d[1];
-			
+		(d_ctx_b + thread * 4)[sub] = d[1];
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
@@ -608,6 +715,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
 	dim3 grid( ctx->device_blocks );
 	dim3 block( ctx->device_threads );
+	dim3 block2( ctx->device_threads << 2 );
 	dim3 block4( ctx->device_threads << 2 );
 	dim3 block8( ctx->device_threads << 3 );
 
@@ -638,25 +746,53 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-        CUDA_CHECK_MSG_KERNEL(
-			ctx->device_id,
-			"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-			cryptonight_core_gpu_phase2<ITERATIONS,MEMORY,MASK,ALGO><<<
-				grid,
-				block4,
-				block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-			>>>(
-				ctx->device_blocks*ctx->device_threads,
-				ctx->device_bfactor,
-				i,
-				ctx->d_long_state,
-				ctx->d_ctx_a,
-				ctx->d_ctx_b,
-				ctx->d_ctx_state,
-				nonce,
-				ctx->d_input
-			)
-	    );
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// two threads per block
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO><<<
+					grid,
+					block2,
+					sizeof(uint64_t) * block2.x * 8 +
+						// shuffle memory for fermi gpus
+						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
+		else
+		{
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_quad<ITERATIONS,MEMORY,MASK,ALGO><<<
+					grid,
+					block4,
+					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
 	}
@@ -700,7 +836,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
 	}
-	else if(miner_algo == cryptonight_lite)
+	/*else if(miner_algo == cryptonight_lite)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
 	}
@@ -722,10 +858,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
+	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
+	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
 	}
+	*/
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 1ea54ddba..a4d88f21f 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -283,13 +283,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		break;
 
 	};
-	const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1];
 
-	/* Disable L1 cache for GPUs before Volta.
-	 * L1 speed is increased and latency reduced with Volta.
-	 */
-	if(gpuArch < 70)
-		CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+	// prefer shared memory over L1 cache
+	CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
 
 	size_t hashMemSize = std::max(
 		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
@@ -691,6 +687,25 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			ctx->device_threads = 64;
 		}
 
+		// check if cryptonight_monero_v8 is selected for the user pool
+		bool useCryptonight_v8 =
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+		// overwrite default config if cryptonight_monero_v8 is mined
+		if(useCryptonight_v8)
+		{
+			// 4 based on my test maybe it must be adjusted later
+			size_t threads = 4;
+			// 8 is chosen by checking the occupancy calculator
+			size_t blockOptimal = 8 * ctx->device_mpcount;
+
+			if(blockOptimal * threads * hashMemSize < limitedMemory)
+			{
+				ctx->device_threads = threads;
+				ctx->device_blocks = blockOptimal;
+			}
+		}
 	}
 	printf("device init succeeded\n");
 

From 010cbd98bd618a70898aca14426c80d9ef963150 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 30 Sep 2018 22:11:29 +0200
Subject: [PATCH 48/77] cpu: fix missing `asm` autoadjust

In the auto-adjust code path without hwloc, the `asm` entry was missing from the generated config.
---
 xmrstak/backend/cpu/autoAdjust.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 28ff515d4..b192ddc35 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -58,7 +58,7 @@ class autoAdjust
 			if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
 				printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size);
 
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true,  \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down.");
 		}
 		else

From 22e63ceb33d0ed71c26db94a4e22a608f57d28f1 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 1 Oct 2018 20:03:31 +0200
Subject: [PATCH 49/77] remove using of type `uint`

`uint` is not defined on Windows, therefore switch to the fixed-width type `uint32_t`.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index a6501a9fb..a7bdaca5e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -210,7 +210,7 @@ struct u64 : public uint2
 
 	__forceinline__ __device__ u64(){}
 
-	__forceinline__ __device__ u64( const uint x0, const uint x1)
+	__forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1)
 	{
 		uint2::x = x0;
 		uint2::y = x1;

From f27ea67e72f8ab75292ba96293fdce277d0aa3cd Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 1 Oct 2018 20:07:38 +0200
Subject: [PATCH 50/77] add CUDA 10.0 support

- extend the MSVC workaround for CUDA to 10.0
- add compute architecture 75 if CUDA 10.0 is found
---
 CMakeLists.txt | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d..3b371b560 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,11 @@ if(CUDA_ENABLE)
                 list(APPEND DEFAULT_CUDA_ARCH "70")
             endif()
         endif()
+        # add Turing support for CUDA >= 10.0
+        if(NOT CUDA_VERSION VERSION_LESS 10.0)
+            list(APPEND DEFAULT_CUDA_ARCH "75")
+        endif()
+
         set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')")
 
         # generate comma separated list with architectures
@@ -186,7 +191,10 @@ if(CUDA_ENABLE)
             endif()
 
             if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND
-                (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1 OR CUDA_VERSION VERSION_EQUAL 9.2)
+                (CUDA_VERSION VERSION_EQUAL 9.0 OR
+                CUDA_VERSION VERSION_EQUAL 9.1 OR
+                CUDA_VERSION VERSION_EQUAL 9.2 OR
+                CUDA_VERSION VERSION_EQUAL 10.0)
             )
                 # workaround find_package(CUDA) is using the wrong path to the CXX host compiler
                 # overwrite the CUDA host compiler variable with the used CXX MSVC

From 25634d4aab915c48c6deaf574990b72c5954454e Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 1 Oct 2018 22:02:16 +0200
Subject: [PATCH 51/77] cpu: asm double hash

- restructure the asm preparation function
- add double hash asm code
---
 ..._v8_double_main_loop_sandybridge_linux.inc | 410 ++++++++++++++++++
 ..._v8_double_main_loop_sandybridge_win64.inc | 410 ++++++++++++++++++
 .../cpu/crypto/asm/cryptonight_v8_main_loop.S |  10 +
 .../crypto/asm/cryptonight_v8_main_loop.asm   |   7 +
 .../backend/cpu/crypto/cryptonight_aesni.h    |  68 ++-
 xmrstak/backend/cpu/minethd.cpp               |  21 +-
 6 files changed, 904 insertions(+), 22 deletions(-)
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
 create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc

diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
new file mode 100644
index 000000000..79adab671
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+ALIGN 16
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movq	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movq	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
new file mode 100644
index 000000000..ad8f18233
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN 64
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movd	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movd xmm0, rdx
+	xor rdx, [r11+r13]
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movd r9, xmm10
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movd	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movd	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
index b6be9438f..c0a3d0b41 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -9,6 +9,7 @@
 #endif
 .global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
 .global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm)
 
 ALIGN 8
 FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
@@ -25,3 +26,12 @@ FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
 	#include "cryptonight_v8_main_loop_ryzen_linux.inc"
 	add rsp, 48
 	ret 0
+
+ALIGN 16
+FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	mov rdx, rsi
+	#include "cryptonight_v8_double_main_loop_sandybridge_linux.inc"
+	add rsp, 48
+	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
index a1615e9bd..1f3d2e15c 100644
--- a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -1,6 +1,7 @@
 _TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cryptonight_v8_mainloop_ivybridge_asm
 PUBLIC cryptonight_v8_mainloop_ryzen_asm
+PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm
 
 ALIGN 8
 cryptonight_v8_mainloop_ivybridge_asm PROC
@@ -14,5 +15,11 @@ cryptonight_v8_mainloop_ryzen_asm PROC
 	ret 0
 cryptonight_v8_mainloop_ryzen_asm ENDP
 
+ALIGN 8
+cryptonight_v8_double_mainloop_sandybridge_asm PROC
+	INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc
+	ret 0
+cryptonight_v8_double_mainloop_sandybridge_asm ENDP
+
 _TEXT_CNV8_MAINLOOP ENDS
 END
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index c0f122fd6..e8c0aca2b 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -940,21 +940,63 @@ struct Cryptonight_hash<5>
 
 extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
 extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-template<xmrstak_algo ALGO, int asm_version>
-void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+
+template< size_t N, size_t asm_version>
+struct Cryptonight_hash_asm;
+
+template<size_t asm_version>
+struct Cryptonight_hash_asm<1, asm_version>
 {
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 1;
 
-	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	if (asm_version == 1)
-		cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
-	else
-		cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
+		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
 
-	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
-	keccakf((uint64_t*)ctx[0]->hash_state, 24);
-	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
-}
+		if(asm_version == 0)
+			cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
+		else if(asm_version == 1)
+			cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
+
+		cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		keccakf((uint64_t*)ctx[0]->hash_state, 24);
+		extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+	}
+};
+
+// double hash only for intel
+template< >
+struct Cryptonight_hash_asm<2, 0>
+{
+	static constexpr size_t N = 2;
+
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			/* Optim - 99% time boundary */
+			cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+		}
+
+		cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
+
+		for(size_t i = 0; i < N; ++i)
+		{
+			/* Optim - 90% time boundary */
+			cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+			/* Optim - 99% time boundary */
+			keccakf((uint64_t*)ctx[i]->hash_state, 24);
+			extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		}
+	}
+};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index a344a9ffe..bb80b938f 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -455,24 +455,27 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	static_assert(N >= 1, "number of threads must be >= 1" );
 
 	// check for asm optimized version for cryptonight_v8
-	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
 	{
 		if(asm_version_str != "off")
 		{
+			if(asm_version_str != "intel" && asm_version_str != "ryzen")
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+
 			if(asm_version_str == "intel")
 			{
 				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
-				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+				if(N == 1)
+					return Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+				else if(N == 2)
+					return Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
 			}
-			if(asm_version_str == "ryzen")
+			// supports only 1 thread per hash
+			if(N == 1 && asm_version_str == "ryzen")
 			{
 				// AMD Ryzen (1xxx and 2xxx series)
-				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
-			}
-			else
-			{
-				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
-			}
+				return Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
+			}		
 		}
 	}
 	// We have two independent flag bits in the functions

From 1e5bb803a472b21672a69f2b5287c916fbd80f1d Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 1 Oct 2018 23:03:22 +0200
Subject: [PATCH 52/77] re-enable algorithm for cuda

I disabled a few algorithms for faster compilation and forgot to re-enable them.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index a7bdaca5e..0f6e47cca 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -836,7 +836,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
 	}
-	/*else if(miner_algo == cryptonight_lite)
+	else if(miner_algo == cryptonight_lite)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
 	}
@@ -864,5 +864,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
 	}
-	*/
+	
 }

From 70f3e82526cdd88607c55f7fab14f57ff0a5aba8 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 3 Oct 2018 20:45:29 +0200
Subject: [PATCH 53/77] rework all currencies

- introduce monero oct 2018 fork as currency `monero`
- remove monero7
- change all dev pools
  - those mining monero7 now handle the fork to monero
  - if the dev pool cannot handle the fork to monero, the currency is fixed to `monero` (we can only handle 2 different currencies for user and dev pool)
- remove the guards that prevent using the currency `monero`
---
 doc/FAQ.md                     |  9 ++---
 xmrstak/backend/amd/config.tpl |  2 +-
 xmrstak/jconf.cpp              | 61 +++++++++++++---------------------
 xmrstak/pools.tpl              |  3 +-
 4 files changed, 27 insertions(+), 48 deletions(-)

diff --git a/doc/FAQ.md b/doc/FAQ.md
index 2d2820166..f744e3d24 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -9,7 +9,6 @@
 * [Virus Protection Alert](#virus-protection-alert)
 * [Change Currency to Mine](#change-currency-to-mine)
 * [How can I mine Monero](#how-can-i-mine-monero)
-* [Why is Monero named monero7](#why-is-monero-named-monero7)
 * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
 * [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
 
@@ -88,16 +87,12 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua
 
 If the miner is compiled for Monero and Aeon than you can change
  - the value `currency` in the config *or*
- - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon7`
+ - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7`
  - run `xmr-stak --help` to see all supported currencies and algorithms
 
 ## How can I mine Monero
 
-Set the value `currency` in `pools.txt` to `monero7`.
-
-## Why is Monero named monero7
-
-To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`.
+Set the value `currency` in `pools.txt` to `monero`.
 
 ## Which currency must be chosen if my fork coin is not listed
 
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 18ef8c696..0b5dcf863 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -10,7 +10,7 @@ R"===(// generated by XMRSTAK_VERSION
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
  *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- *                             (not allowed for cryptonight_v8 and monero8)
+ *                             (not allowed for cryptonight_v8 and monero)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 355da8e6e..b608c0028 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -87,30 +87,29 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
-	{ "aeon7",               {cryptonight_aeon, cryptonight_lite, 7u},     {cryptonight_aeon, cryptonight_lite, 7u},     "mine.aeon-pool.com:5555" },
-	{ "bbscoin",             {cryptonight_aeon, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "bittube",             {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
-	{ "cryptonight",         {cryptonight_monero, cryptonight, 255u},      {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
-	{ "cryptonight_masari",  {cryptonight_monero, cryptonight_masari, 255u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
-	{ "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
-	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr },
-	{ "cryptonight_v7",      {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr },
-	{ "graft",               {cryptonight_monero, cryptonight, 8u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "haven",               {cryptonight_haven, cryptonight_heavy, 3u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
-	{ "monero8",             {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
-	{ "qrl",             	 {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "stellite",            {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "turtlecoin",          {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr }
+	{ "aeon7",               {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     "mine.aeon-pool.com:5555" },
+	{ "bbscoin",             {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "bittube",             {cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
+	{ "cryptonight",         {cryptonight_monero_v8, cryptonight, 255u},          {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
+	{ "cryptonight_masari",  {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},          {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u},      {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v8",      {cryptonight_monero, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "graft",               {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "intense",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "masari",              {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "monero",              {cryptonight_monero_v8, cryptonight_monero, 8u},     {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
+	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "turtlecoin",          {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
@@ -326,13 +325,6 @@ bool jconf::IsOnAlgoList(std::string& needle)
 {
 	std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower);
 
-	if(needle == "monero")
-	{
-		printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n"
-			"Please use 'monero7' (support auto switch to new POW) if you want to mine Monero, \nor name the coin that you want to mine.");
-		return false;
-	}
-
 	for(size_t i=0; i < coin_algo_size; i++)
 	{
 		if(needle == coins[i].coin_name)
@@ -617,13 +609,6 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 
 	for(size_t i=0; i < coin_algo_size; i++)
 	{
-		if(ctmp == "monero")
-		{
-			printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n"
-				"Please use monero7 (support auto switch to new POW) if you want to mine Monero, or name the coin that you want to mine.");
-			return false;
-		}
-
 		if(ctmp == coins[i].coin_name)
 		{
 			currentCoin = coins[i];
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 59c4ba9d6..3e21f416d 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -28,8 +28,7 @@ POOLCONF],
  *    haven (automatic switch with block version 3 to cryptonight_haven)
  *    intense
  *    masari
- *    monero7
- *    monero8 (use this to support Monero's Oct 2018 fork)
+ *    monero (use this to support Monero's Oct 2018 fork)
  *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin

From b926a476fafbf445d52970116379fdc9a53c16a6 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Thu, 4 Oct 2018 11:12:09 -0600
Subject: [PATCH 54/77] spelling+typo touch-ups

---
 xmrstak/backend/cpu/autoAdjustHwloc.hpp | 5 +++--
 xmrstak/backend/nvidia/minethd.cpp      | 2 +-
 xmrstak/http/httpd.cpp                  | 6 +++---
 xmrstak/http/webdesign.cpp              | 2 +-
 xmrstak/http/webdesign.hpp              | 2 +-
 xmrstak/pools.tpl                       | 4 ++--
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 2bebf82d0..7180491f7 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"off\", \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
@@ -78,7 +78,8 @@ class autoAdjust
 		catch(const std::runtime_error& err)
 		{
 			// \todo add fallback to default auto adjust
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false");
+			conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
 		}
 
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 423cd201a..0153eed19 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -277,7 +277,7 @@ void minethd::work_main()
 			if((round_ctr++ & 0xF) == 0)
 			{
 				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16);
-				// check if the job is still valid, there is a small posibility that the job is switched
+				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp
index dba7d7cdb..ed9abc2bc 100644
--- a/xmrstak/http/httpd.cpp
+++ b/xmrstak/http/httpd.cpp
@@ -71,17 +71,17 @@ int httpd::req_handler(void * cls,
 		if (username == NULL)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
 		free(username);
 
-		ret = MHD_digest_auth_check(connection, sHttpAuthRelam, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
+		ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
 		if (ret == MHD_INVALID_NONCE || ret == MHD_NO)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index d6ee66e8d..93e217519 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -113,7 +113,7 @@ extern const char sHtmlCssFile [] =
 
 size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1;
 
-extern const char sHttpAuthRelam[] = "XMR-Stak-Miner";
+extern const char sHttpAuthRealm[] = "XMR-Stak-Miner";
 extern const char sHttpAuthOpaque[] = "6c071f0df539e234cadbcd79164af7a594e23ab42bccb834df796aead6ce96e4";
 
 extern const char sHtmlAccessDenied[] =
diff --git a/xmrstak/http/webdesign.hpp b/xmrstak/http/webdesign.hpp
index 48adfea98..bcbe5ae1d 100644
--- a/xmrstak/http/webdesign.hpp
+++ b/xmrstak/http/webdesign.hpp
@@ -7,7 +7,7 @@ extern size_t sHtmlCssSize;
 extern const char sHtmlAccessDenied[];
 extern size_t sHtmlAccessDeniedSize;
 
-extern const char sHttpAuthRelam[];
+extern const char sHttpAuthRealm[];
 extern const char sHttpAuthOpaque[];
 
 extern const char sHtmlCommonHeader[];
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 59c4ba9d6..37c532f3b 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -23,7 +23,7 @@ POOLCONF],
  *
  *    aeon7 (use this for Aeon's new PoW)
  *    bbscoin (automatic switch with block version 3 to cryptonight_v7)
- *    bittube (uses cyrptonight_bittube2 algorithm)
+ *    bittube (uses cryptonight_bittube2 algorithm)
  *    graft
  *    haven (automatic switch with block version 3 to cryptonight_haven)
  *    intense
@@ -45,7 +45,7 @@ POOLCONF],
  *    cryptonight_v7
  *    cryptonight_v8
  *    # 4MiB scratchpad memory
- *    cyrptonight_bittube2
+ *    cryptonight_bittube2
  *    cryptonight_haven
  *    cryptonight_heavy
  */

From 17e0b06eb83da2403eab61c2f7f270b79f7a0b48 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Thu, 4 Oct 2018 11:26:33 -0600
Subject: [PATCH 55/77] whitespace trims

---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl |  6 +++---
 xmrstak/backend/amd/config.tpl                    |  2 +-
 xmrstak/backend/amd/jconf.cpp                     |  2 +-
 xmrstak/backend/amd/minethd.cpp                   |  2 +-
 xmrstak/backend/amd/minethd.hpp                   |  2 +-
 xmrstak/backend/cpu/crypto/cryptonight_aesni.h    |  2 +-
 xmrstak/backend/cpu/minethd.cpp                   |  2 +-
 xmrstak/backend/globalStates.cpp                  |  6 +++---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu     | 14 +++++++-------
 xmrstak/cpputil/read_write_lock.h                 |  4 ++--
 10 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index e65f0ed05..fd630aff3 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -569,7 +569,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 #else
 #	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
 #endif
-	
+
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -581,7 +581,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong a[2];
 
 // cryptonight_monero_v8
-#if(ALGO==11)		
+#if(ALGO==11)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -813,7 +813,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = ((~d) ^ q) & MASK;
 #endif
-		
+
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 18ef8c696..98d90abe0 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -22,7 +22,7 @@ R"===(// generated by XMRSTAK_VERSION
  *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, 
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
  *    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
  * ],
  * If you do not wish to mine with your AMD GPU(s) then use:
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index fb1a04b4c..fab91d7e3 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -150,7 +150,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	}
 
 	cfg.memChunk = (int)memChunk->GetInt64();
-	
+
 	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128)
 	{
 		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two");
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5ac246335..45979cbd6 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -236,7 +236,7 @@ void minethd::work_main()
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
-			
+
 
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 04c2ff8ad..32e66ec87 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -31,7 +31,7 @@ class minethd  : public iBackend
 	void work_main();
 
 	uint64_t iJobNo;
-	
+
 	miner_work oWork;
 
 	std::promise<void> order_fix;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index e8c0aca2b..2b1741764 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -797,7 +797,7 @@ template< >
 struct Cryptonight_hash<1>
 {
 	static constexpr size_t N = 1;
-	
+
 	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index bb80b938f..3a94daa5f 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -475,7 +475,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 			{
 				// AMD Ryzen (1xxx and 2xxx series)
 				return Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
-			}		
+			}
 		}
 	}
 	// We have two independent flag bits in the functions
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 3bd7d0eea..4eeed3c4b 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -39,7 +39,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId)
 
 	threadWork = oGlobalWork;
 	currentJobId = iGlobalJobNo.load(std::memory_order_relaxed);
-	
+
 	jobLock.UnLock();
 }
 
@@ -51,7 +51,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 	 * To avoid duplicated shared this must be done before the nonce is exchanged.
 	 */
 	iGlobalJobNo++;
-	
+
 	size_t xid = dat.pool_id;
 	dat.pool_id = pool_id;
 	pool_id = xid;
@@ -62,7 +62,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 	 */
 	dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed);
 	oGlobalWork = pWork;
-	
+
 	jobLock.UnLock();
 }
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 0f6e47cca..cceca876d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -287,7 +287,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
     volatile uint32_t* sPtr = NULL;
 	// 8 x 64bit values
 	u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8);
-	
+
 #endif
 
 	__syncthreads( );
@@ -344,10 +344,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
 			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
 		);
-	
+
 		if(ALGO == cryptonight_monero_v8)
 		{
-			
+
 			const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ];
 			const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ];
 			const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ];
@@ -376,7 +376,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 
 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
-		
+
 		uint64_t cx_mul;
 		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
 		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
@@ -400,7 +400,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 				__syncthreads( );
 #endif
 		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
-	
+
 		{
 			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
 			// sub 0 -> hi, sub 1 -> lo
@@ -426,7 +426,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		{
 			bx1 = bx0;
 			bx0 = cx_aes;
-		} 
+		}
 		myChunks[ idx1 + sub ] = ax0;
 		for(int x = 0; x < 8; x += 2)
 		{
@@ -864,5 +864,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
 	}
-	
+
 }
diff --git a/xmrstak/cpputil/read_write_lock.h b/xmrstak/cpputil/read_write_lock.h
index 9139dfd22..51f42a2e6 100644
--- a/xmrstak/cpputil/read_write_lock.h
+++ b/xmrstak/cpputil/read_write_lock.h
@@ -61,7 +61,7 @@ class RWLock
 		std::unique_lock<std::mutex> lck(mtx_);
 		if (status_ == -1) {
 			status_ = 0;
-		} 
+		}
 		else
 		{
 			status_ -= 1;
@@ -72,7 +72,7 @@ class RWLock
 			{
 				write_cv_.notify_one();
 			}
-		} 
+		}
 		else
 		{
 			read_cv_.notify_all();

From 21ce03855d168b624f2fda67ad5ac933b3c6b74c Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Thu, 4 Oct 2018 21:35:21 +0200
Subject: [PATCH 56/77] add cpu family and model detection

Helper functions to select the asm version based on the number of hashes used per thread and the family name of the cpu.

- use the new cpu type functions to fix the wrong AMD family detection in `autoAdjust.hpp`
- allow to set the asm version to `auto`
- rename asm option `intel` to `intel_avx`
- rename asm option `ryzen` to `amd_avx`

Co-authored-by: fireice-uk <fireice-uk@users.noreply.github.com>
---
 xmrstak/backend/cpu/autoAdjust.hpp      | 14 ++---
 xmrstak/backend/cpu/autoAdjustHwloc.hpp |  2 +-
 xmrstak/backend/cpu/config.tpl          | 13 ++--
 xmrstak/backend/cpu/cpuType.cpp         | 79 ++++++++++++++++++++++++
 xmrstak/backend/cpu/cpuType.hpp         | 32 ++++++++++
 xmrstak/backend/cpu/minethd.cpp         | 81 +++++++++++++++++--------
 6 files changed, 179 insertions(+), 42 deletions(-)
 create mode 100644 xmrstak/backend/cpu/cpuType.cpp
 create mode 100644 xmrstak/backend/cpu/cpuType.hpp

diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index b192ddc35..e7f3e9148 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -7,6 +7,7 @@
 #include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/params.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include <string>
 
 #ifdef _WIN32
@@ -20,14 +21,6 @@ namespace xmrstak
 {
 namespace cpu
 {
-// Mask bits between h and l and return the value
-// This enables us to put in values exactly like in the manual
-// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22)
-inline int32_t get_masked(int32_t val, int32_t h, int32_t l)
-{
-	val &= (0x7FFFFFFF >> (31-(h-l))) << l;
-	return val >> l;
-}
 
 class autoAdjust
 {
@@ -82,7 +75,7 @@ class autoAdjust
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
@@ -143,7 +136,8 @@ class autoAdjust
 			L3KB_size = get_masked(cpu_info[3], 31, 18) * 512;
 
 			::jconf::cpuid(1, 0, cpu_info);
-			if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen
+
+			if(getModel().family < 0x17) //0x17h is Zen
 				old_amd = true;
 
 			return true;
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 7180491f7..b61582588 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 37158d6e2..1a64860e4 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -11,10 +11,11 @@ R"===(// generated by XMRSTAK_VERSION
  * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [off, intel, ryzen]
- *                    - off: used the default implementation (no assembler version)
- *                    - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
- *                    - ryzen: AMD Ryzen (1xxx and 2xxx series)
+ * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, off, intel_avx, amd_avx]
+ *                    - auto: xmr-stak will automatically detect the asm type (default)
+ *                    - off: disable the usage of optimized assembler
+ *                    - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx
+ *                    - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series
  *
  * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
@@ -27,8 +28,8 @@ R"===(// generated by XMRSTAK_VERSION
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
new file mode 100644
index 000000000..5959b75cc
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -0,0 +1,79 @@
+
+#include "xmrstak/backend/cpu/cpuType.hpp"
+
+#include <cstring>
+#include <inttypes.h>
+#include <cstdio>
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+namespace xmrstak
+{
+namespace cpu
+{
+	void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+	{
+		std::memset(val, 0, sizeof(int32_t)*4);
+
+	#ifdef _WIN32
+		__cpuidex(val, eax, ecx);
+	#else
+		__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+	#endif
+	}
+
+	int32_t get_masked(int32_t val, int32_t h, int32_t l)
+	{
+		val &= (0x7FFFFFFF >> (31-(h-l))) << l;
+		return val >> l;
+	}
+
+	bool has_feature(int32_t val, int32_t bit)
+	{
+		int32_t mask = 1 << bit;
+		return (val & mask) != 0u;
+		
+	}
+	
+	Model getModel()
+	{
+		int32_t cpu_info[4];
+		char cpustr[13] = {0};
+
+		cpuid(0, 0, cpu_info);
+		std::memcpy(cpustr, &cpu_info[1], 4);
+		std::memcpy(cpustr+4, &cpu_info[3], 4);
+		std::memcpy(cpustr+8, &cpu_info[2], 4);
+
+		Model result;
+
+		cpuid(1, 0, cpu_info);
+		
+		result.family = get_masked(cpu_info[0], 12, 8);
+		result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
+		result.type_name = cpustr;
+
+		// feature bits https://en.wikipedia.org/wiki/CPUID
+		// sse2
+		result.sse2 = has_feature(cpu_info[3], 26);
+		// aes-ni
+		result.aes = has_feature(cpu_info[2], 25);
+		// avx
+		result.avx = has_feature(cpu_info[2], 28);	
+
+		if(strcmp(cpustr, "AuthenticAMD") == 0)
+		{
+			if(result.family == 0xF)
+				result.family += get_masked(cpu_info[0], 28, 20);
+		}
+
+		return result;
+	}
+
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
new file mode 100644
index 000000000..7f6bfaf51
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <string>
+#include <cstdint>
+
+
+namespace xmrstak
+{
+namespace cpu
+{
+	struct Model
+	{
+		uint32_t family = 0u;
+		uint32_t model = 0u;
+		bool aes = false;
+		bool sse2 = false;
+		bool avx = false;
+		std::string type_name = "unknown";
+	};
+
+	Model getModel();
+
+	/** Mask bits between h and l and return the value
+	 *
+	 * This enables us to put in values exactly like in the manual
+	 * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
+	 */
+	int32_t get_masked(int32_t val, int32_t h, int32_t l);
+
+	
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 3a94daa5f..795ed1b65 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -27,6 +27,7 @@
 #include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include "xmrstak/params.hpp"
 #include "jconf.hpp"
 
@@ -449,35 +450,33 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 	return pvThreads;
 }
 
+/** get the supported asm name
+ *
+ * @return asm type based on the number of hashes per thread the internal
+ *             evaluated cpu type
+ */
+static std::string getAsmName(const uint32_t num_hashes)
+{
+	std::string asm_type = "off";
+	if(num_hashes == 0)
+		return asm_type;
+
+	auto cpu_model = getModel();
+
+	if(cpu_model.avx && cpu_model.aes)
+	{
+		if(cpu_model.type_name.find("Intel") != std::string::npos)
+			asm_type = "intel_avx";
+		else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
+			asm_type = "amd_avx";
+	}
+}
+
 template<size_t N>
 minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
 
-	// check for asm optimized version for cryptonight_v8
-	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
-	{
-		if(asm_version_str != "off")
-		{
-			if(asm_version_str != "intel" && asm_version_str != "ryzen")
-				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
-
-			if(asm_version_str == "intel")
-			{
-				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
-				if(N == 1)
-					return Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
-				else if(N == 2)
-					return Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
-			}
-			// supports only 1 thread per hash
-			if(N == 1 && asm_version_str == "ryzen")
-			{
-				// AMD Ryzen (1xxx and 2xxx series)
-				return Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
-			}
-		}
-	}
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -584,7 +583,39 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	digit.set(0, !bHaveAes);
 	digit.set(1, !bNoPrefetch);
 
-	return func_table[ algv << 2 | digit.to_ulong() ];
+	auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
+
+
+        // check for asm optimized version for cryptonight_v8
+        if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
+        {
+                std::string selected_asm = asm_version_str;
+                if(selected_asm == "auto")
+                        selected_asm = cpu::getAsmName(N);
+
+                if(selected_asm != "off")
+                {
+                        if(selected_asm == "intel_avx")
+                        {
+                                // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+                                if(N == 1)
+                                        selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+                                else if(N == 2)
+                                        selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
+                        }
+                        // supports only 1 thread per hash
+                        if(N == 1 && selected_asm == "amd_avx")
+                        {
+                                // AMD Ryzen (1xxx and 2xxx series)
+                                selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
+                        }
+                        if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
+                                printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
+						else if(selected_asm != "intel_avx" || selected_asm != "amd_avx") // unknown asm type
+                                printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
+                }
+        }
+	return selected_function;
 }
 
 minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)

From 5df8075715d7d8d06f45994e2462eac6a7ae16a6 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 5 Oct 2018 20:42:19 +0200
Subject: [PATCH 57/77] fix wrong option in config.tpl

In #1839 the option for slow memory was silently changed. This can lead to crashes on Linux systems where the user is not allowed to use large pages.
---
 xmrstak/config.tpl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index deb52aa09..96f0e9cb2 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -104,8 +104,7 @@ R"===(// generated by XMRSTAK_VERSION
  *           It will never use slow memory, but it won't attempt to mlock---LINUX
  * never   - If we fail to allocate large pages we will print an error and exit.
  */
-"use_slow_memory" : "warn",---WINDOWS
-"use_slow_memory" : "no_mlck",---LINUX
+"use_slow_memory" : "warn",
 
 /*
  * TLS Settings

From 99a12cb6b155f27a8c62964efbdea37174224512 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 5 Oct 2018 21:05:18 +0200
Subject: [PATCH 58/77] CUDA: tune cryptonight_v8

Read memory in bigger chunks per thread to increase the used memory bandwidth.
For Kepler and Fermi GPUs, use the old autosuggestion instead of the new settings for cryptonight_v8.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 21 +++++--------------
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  4 ++--
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 0f6e47cca..22bcf16eb 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -329,11 +329,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	{
 		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
 
-		#pragma unroll 4
-		for(int x = 0; x < 8; x += 2)
-		{
-			myChunks[x + sub] = ptr0[ x + sub ];
-		}
+		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
@@ -362,17 +358,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		}
 
 		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
-		for(int x = 0; x < 8; x += 2)
-			ptr0[ x + sub ] = myChunks[x + sub];
+		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
 		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
-		#pragma unroll 4
-		for(int x = 0; x < 8; x += 2)
-		{
-			myChunks[x + sub] = ptr0[ x + sub ];
-		}
+
+		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
@@ -428,10 +420,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			bx0 = cx_aes;
 		} 
 		myChunks[ idx1 + sub ] = ax0;
-		for(int x = 0; x < 8; x += 2)
-		{
-			ptr0[ x + sub ] = myChunks[x + sub];
-		}
+		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 		ax0 ^= c;
 		idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
 	}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index a4d88f21f..f136744d4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -692,8 +692,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
 			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;
 
-		// overwrite default config if cryptonight_monero_v8 is mined
-		if(useCryptonight_v8)
+		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
+		if(useCryptonight_v8 && gpuArch >= 50)
 		{
 			// 4 based on my test maybe it must be adjusted later
 			size_t threads = 4;

From 8e1e7447c2c7d61a1c2f016d5e285c9a6d65ae9f Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 5 Oct 2018 22:21:52 +0200
Subject: [PATCH 59/77] fix invalid shares

With rocm we fought with invalid shares for a very long time. This is now solved with rocm 1.9 and
this tiny fix.
It is not fully clear where a memory optimization kicks in and breaks the kernel `Groestl` if the variables `M` and `H` are not `volatile`.
The performance will not change with this fix.

The fix is tested with rocm 1.9 with a VEGA64 and a RX570
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index e65f0ed05..317352722 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -1221,7 +1221,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		#pragma unroll 4
 		for(uint i = 0; i < 4; ++i)
 		{
-			ulong H[8], M[8];
+			volatile ulong H[8], M[8];
 
 			if(i < 3)
 			{

From 2370aeef739fd3901359b00d562ec99625b5099e Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Fri, 5 Oct 2018 22:19:15 -0600
Subject: [PATCH 60/77] Fix two new warnings within new code

---
 xmrstak/backend/amd/amd_gpu/gpu.cpp |  2 +-
 xmrstak/backend/cpu/minethd.cpp     | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index e2c2dfeb8..2fe0350a7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -377,7 +377,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 	std::vector<char> openCLDriverVer(1024);
-	if(ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL) != CL_SUCCESS)
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
 		return ERR_OCL_API;
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 795ed1b65..912ef48bb 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -458,18 +458,19 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 static std::string getAsmName(const uint32_t num_hashes)
 {
 	std::string asm_type = "off";
-	if(num_hashes == 0)
-		return asm_type;
-
-	auto cpu_model = getModel();
-
-	if(cpu_model.avx && cpu_model.aes)
+	if(num_hashes != 0)
 	{
-		if(cpu_model.type_name.find("Intel") != std::string::npos)
-			asm_type = "intel_avx";
-		else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
-			asm_type = "amd_avx";
+		auto cpu_model = getModel();
+
+		if(cpu_model.avx && cpu_model.aes)
+		{
+			if(cpu_model.type_name.find("Intel") != std::string::npos)
+				asm_type = "intel_avx";
+			else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
+				asm_type = "amd_avx";
+		}
 	}
+	return asm_type;
 }
 
 template<size_t N>

From 746037d8fb33608224d6c2f17cbda91e5a328d3c Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 7 Oct 2018 10:05:57 +0200
Subject: [PATCH 61/77] OpenCL: fix definition range for unroll

fix #1870

- remove zero from the valid definition range for the loop unroll option
---
 xmrstak/backend/amd/config.tpl | 2 +-
 xmrstak/backend/amd/jconf.cpp  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index b852a7e81..c3da93a38 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -15,7 +15,7 @@ R"===(// generated by XMRSTAK_VERSION
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
- * unroll        - allow to control how often the POW main loop is unrolled; valid range [0;128) - for most OpenCL implementations it must be a power of two.
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
  *                 to use a intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index fab91d7e3..152f8add4 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -151,9 +151,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 
 	cfg.memChunk = (int)memChunk->GetInt64();
 
-	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128)
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (int)unroll->GetInt64() == 0)
 	{
-		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two");
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero");
 		return false;
 	}
 	cfg.unroll = (int)unroll->GetInt64();

From 1c0ef1548f1890cb80c5e41d12b42987ed3fb6a1 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 7 Oct 2018 21:30:48 +0200
Subject: [PATCH 62/77] fix crash with monero and strided_index

Strided index 1 is not allowed for cryptonight_v8 and monero.
If the dev pool is set to monero and the user tuned their settings for
another currency, the miner will crash if strided index or memChunk does not
meet the requirements to mine monero.
This PR detects wrong configurations and sets strided index and memChunk to a valid
value, but only for cryptonight_v8. The user pool settings will only be changed if monero or
cryptonight_v8 is selected.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 32 ++++++++++++++---------------
 xmrstak/backend/amd/config.tpl      |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 2fe0350a7..7c7aff788 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -396,12 +396,26 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		int threadMemMask = cn_select_mask(miner_algo[ii]);
 		int hashIterations = cn_select_iter(miner_algo[ii]);
 
+		size_t mem_chunk_exp = 1u << ctx->memChunk;
+		size_t strided_index = ctx->stridedIndex;
+		/* Adjust the config settings to a valid combination
+		 * this is required if the dev pool is mining monero
+		 * but the user tuned there settings for another currency
+		 */
+		if(miner_algo[ii] == cryptonight_monero_v8)
+		{
+			if(ctx->memChunk < 2)
+				mem_chunk_exp = 1u << 2;
+			if(strided_index == 1)
+				strided_index = 0;
+		}
+
 		std::string options;
 		options += " -DITERATIONS=" + std::to_string(hashIterations);
 		options += " -DMASK=" + std::to_string(threadMemMask);
 		options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
-		options += " -DSTRIDED_INDEX=" + std::to_string(ctx->stridedIndex);
-		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(1u << ctx->memChunk);
+		options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
+		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
 		options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
 		options += " -DMEMORY=" + std::to_string(hashMemSize);
 		options += " -DALGO=" + std::to_string(miner_algo[ii]);
@@ -931,20 +945,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
 		}
 
-		if(useCryptonight_v8)
-		{
-			if(ctx[i].stridedIndex == 1)
-			{
-				printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
-				return ERR_STUPID_PARAMS;
-			}
-			if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2)
-			{
-				printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
-				return ERR_STUPID_PARAMS;
-			}
-		}
-
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index b852a7e81..49033c81b 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -10,7 +10,7 @@ R"===(// generated by XMRSTAK_VERSION
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
  *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- *                             (not allowed for cryptonight_v8 and monero)
+ *                             (for cryptonight_v8 and monero it is equal to strided_index = 0)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2

From 53652d35e707493416e0cdd5f8cbd9479294ac42 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 7 Oct 2018 10:11:06 +0200
Subject: [PATCH 63/77] CPU: fix logical error

Fix wrong warning about unknown ASM type
---
 xmrstak/backend/cpu/minethd.cpp | 59 +++++++++++++++++----------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 795ed1b65..ccf802e12 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -586,35 +586,36 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
 
 
-        // check for asm optimized version for cryptonight_v8
-        if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
-        {
-                std::string selected_asm = asm_version_str;
-                if(selected_asm == "auto")
-                        selected_asm = cpu::getAsmName(N);
-
-                if(selected_asm != "off")
-                {
-                        if(selected_asm == "intel_avx")
-                        {
-                                // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
-                                if(N == 1)
-                                        selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
-                                else if(N == 2)
-                                        selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
-                        }
-                        // supports only 1 thread per hash
-                        if(N == 1 && selected_asm == "amd_avx")
-                        {
-                                // AMD Ryzen (1xxx and 2xxx series)
-                                selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
-                        }
-                        if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
-                                printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
-						else if(selected_asm != "intel_avx" || selected_asm != "amd_avx") // unknown asm type
-                                printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
-                }
-        }
+	// check for asm optimized version for cryptonight_v8
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
+	{
+		std::string selected_asm = asm_version_str;
+		if(selected_asm == "auto")
+				selected_asm = cpu::getAsmName(N);
+
+		if(selected_asm != "off")
+		{
+			if(selected_asm == "intel_avx")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				if(N == 1)
+					selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+				else if(N == 2)
+					selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
+			}
+			// supports only 1 thread per hash
+			if(N == 1 && selected_asm == "amd_avx")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
+			}
+			if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
+				printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
+			else if(selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type
+				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
+		}
+	}
+	
 	return selected_function;
 }
 

From eb8376faece53483f54cfa106254f11fab2d4d6d Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 8 Oct 2018 09:21:42 +0200
Subject: [PATCH 64/77] CUDA: use volatile pointer

Use volatile pointer to be sure that the compiler is not caching the values.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 2be6f969f..b844e10c8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -278,15 +278,15 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	}
 
 #if( __CUDA_ARCH__ < 300 )
-	extern __shared__ u64 externShared[];
+	extern __shared__ uint64_t externShared[];
 	// 8 x 64bit values
-	u64* myChunks = (u64*)(externShared + (threadIdx.x >> 1) * 8);
+	volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
     volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
 #else
-	extern __shared__ u64 chunkMem[];
+	extern __shared__ uint64_t chunkMem[];
     volatile uint32_t* sPtr = NULL;
 	// 8 x 64bit values
-	u64* myChunks = (u64*)(chunkMem + (threadIdx.x >> 1) * 8);
+	volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
 
 #endif
 
@@ -301,25 +301,25 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 
 	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
 
-	u64 ax0 = ((u64*)(d_ctx_a + thread * 4))[sub];
-	u64 bx0;
-	uint32_t idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
+	uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub];
+	uint64_t bx0;
+	uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 
-	u64* ptr0;
+	uint64_t* ptr0;
 
-	u64 bx1;
+	uint64_t bx1;
 	uint32_t sqrt_result;
 	uint64_t division_result;
 	if(ALGO == cryptonight_monero_v8)
 	{
-		bx0 = ((u64*)(d_ctx_b + thread * 12))[sub];
-		bx1 = ((u64*)(d_ctx_b + thread * 12 + 4))[sub];
+		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
+		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
 
 		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
 		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
 	}
 	else
-		 bx0 = ((u64*)(d_ctx_b + thread * 4))[sub];
+		 bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
 
 	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
 	const int start = partidx * batchsize;
@@ -327,7 +327,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 
 	for(int i = start; i < end; ++i)
 	{
-		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
 		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
@@ -344,9 +344,9 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		if(ALGO == cryptonight_monero_v8)
 		{
 
-			const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ];
-			const u64 chunk2 = myChunks[ idx1 ^ 4 + sub ];
-			const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ];
+			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
+			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+			const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
 #if (__CUDACC_VER_MAJOR__ >= 9)
 			__syncwarp();
 #else
@@ -362,7 +362,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
-		ptr0 = (u64 *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
 		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
@@ -399,10 +399,10 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
 			if(ALGO == cryptonight_monero_v8)
 			{
-				const u64 chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
-				u64 chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
+				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
 				res ^= ((uint64_t*)&chunk2)[0];
-				const u64 chunk3 = myChunks[ idx1 ^ 6 + sub ];
+				const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
 #if (__CUDACC_VER_MAJOR__ >= 9)
 				__syncwarp();
 #else
@@ -422,16 +422,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		myChunks[ idx1 + sub ] = ax0;
 		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 		ax0 ^= c;
-		idx0 = shuffle<2>(sPtr, sub, ax0.x, 0);
+		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}
 
 	if ( bfactor > 0 )
 	{
-		((u64*)(d_ctx_a + thread * 4))[sub] = ax0;
+		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
 		if(ALGO == cryptonight_monero_v8)
 		{
-			((u64*)(d_ctx_b + thread * 12))[sub] = bx0;
-			((u64*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
+			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
+			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
 
 			if(sub == 1)
 			{
@@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((u64*)(d_ctx_b + thread * 12))[sub] = bx0;
+			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
 	}
 }
 

From 9e592ec58211b91557f955718a02dc02f90981db Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 8 Oct 2018 14:48:22 +0200
Subject: [PATCH 65/77] compatibility and better messages

- add more descriptive messages if memory allocation fails
- add gnu compiler flags: `noexecstack` to support systemd
- handle cases where memory allocation fails

Co-authored-by: Tony Butler <spudz76@gmail.com>
---
 CMakeLists.txt                                   | 10 ++++++++--
 xmrstak/backend/amd/minethd.cpp                  |  5 +++++
 .../backend/cpu/crypto/cryptonight_common.cpp    | 16 +++++++++-------
 xmrstak/backend/cpu/minethd.cpp                  |  8 ++++++++
 xmrstak/cli/cli-miner.cpp                        |  1 +
 xmrstak/config.tpl                               |  5 ++---
 6 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 712fb429e..b714ee0ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -443,6 +443,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     endif()
 endif()
 
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+    set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}")
+endif()
+
 # activate static libgcc and libstdc++ linking
 if(CMAKE_LINK_STATIC)
     set(BUILD_SHARED_LIBRARIES OFF)
@@ -464,7 +469,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
 else()
     # asm optimized monero v8 code
     enable_language(ASM)
-    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY C)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
+    set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
     add_library(xmr-stak-asm
         STATIC
         "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
@@ -587,4 +593,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
 else()
     # this rule is used if the install prefix is the build directory
     install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")")
-endif()
\ No newline at end of file
+endif()
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 45979cbd6..5e70f25a6 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -174,6 +174,11 @@ void minethd::work_main()
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
+	if(cpu_ctx == nullptr)
+	{
+		printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped.");
+		win_exit(1);
+	}
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
 	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a478c9b2a..a7e4696a8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -216,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize);
 		ptr->ctx_info[0] = 0;
 		ptr->ctx_info[1] = 0;
+		if(ptr->long_state == NULL)
+			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
 		return ptr;
 	}
 
@@ -243,25 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		return ptr;
 	}
 #else
-
+//http://man7.org/linux/man-pages/man2/mmap.2.html
 #if defined(__APPLE__)
-	ptr->long_state  = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state  = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
 #elif defined(__OpenBSD__)
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
 #endif
 
 	if (ptr->long_state == MAP_FAILED)
 	{
 		_mm_free(ptr);
-		msg->warning = "mmap failed";
+		msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
 		return NULL;
 	}
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 912ef48bb..3e90159ea 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -226,6 +226,7 @@ bool minethd::self_test()
 	{
 		if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
 		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
 			for (int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			return false;
@@ -683,6 +684,13 @@ void minethd::multiway_work_main()
 	for (size_t i = 0; i < N; i++)
 	{
 		ctx[i] = minethd_alloc_ctx();
+		if(ctx[i] == nullptr)
+		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
+			for (int j = 0; j < i; j++)
+				cryptonight_free_ctx(ctx[j]);
+			win_exit(1);
+		}
 		piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
 		piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
 	}
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index ae39d2505..428952b1b 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -749,6 +749,7 @@ int main(int argc, char *argv[])
 
 	if (!BackendConnector::self_test())
 	{
+		printer::inst()->print_msg(L0, "Self test not passed!");
 		win_exit();
 		return 1;
 	}
diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index 96f0e9cb2..73ae054c2 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -94,9 +94,8 @@ R"===(// generated by XMRSTAK_VERSION
  * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX
  * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX
  * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX
- */
-
-/*
+ *
+ *
  * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here:
  * always  - Don't even try to use large pages. Always use slow memory.
  * warn    - We will try to use large pages, but fall back to slow memory if that fails.

From 801556f693988045818d334d359045d8df26acc9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 8 Oct 2018 21:15:20 +0200
Subject: [PATCH 66/77] select hash function from function array

Use an array instead of an if cascade to select the hashing function for CUDA.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 65 ++++++-------------
 1 file changed, 20 insertions(+), 45 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index b844e10c8..8e69c2029 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -807,51 +807,26 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 	}
 }
 
+typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
+
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
-	if(miner_algo == cryptonight_monero)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_monero_v8)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_heavy)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_lite)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_aeon)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_ipbc)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_stellite)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_masari)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_haven)
-	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_bittube2)
-	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
-	}
-
+	if(miner_algo == invalid_algo) return;
+	
+	static const cuda_hash_fn func_table[] = {
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>
+	};
+
+	cuda_hash_fn selected_function = func_table[ miner_algo - 1u ];
+	selected_function(ctx, startNonce);
 }

From 594a5b4d5b515af2b4f66cf940c10e103ceee40a Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 8 Oct 2018 21:43:25 +0200
Subject: [PATCH 67/77] CUDA: add compatibility mode

Add compatibility mode for CUDA to avoid invalid shares.
---
 xmrstak/backend/nvidia/autoAdjust.hpp         |   1 +
 xmrstak/backend/nvidia/config.tpl             |   3 +
 xmrstak/backend/nvidia/jconf.cpp              |  14 ++-
 xmrstak/backend/nvidia/jconf.hpp              |   1 +
 xmrstak/backend/nvidia/minethd.cpp            |   1 +
 .../backend/nvidia/nvcc_code/cryptonight.hpp  |   1 +
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 109 ++++++++++++++----
 7 files changed, 105 insertions(+), 25 deletions(-)

diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 12468093c..6354f60f0 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -96,6 +96,7 @@ class autoAdjust
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
+					"    \"comp_mode\" : true,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 144da80b9..e2a76d90f 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -16,6 +16,9 @@ R"===(// generated by XMRSTAK_VERSION
  *                 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
+ * comp_mode     - Compatibility if true it will use 64bit memory loads and if false it will use
+ *                               128bit memory loads (can produce invalid results)
+ *                               (this option has only a meaning for cryptonight_v8 and monero)
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index c9d4f194c..b1059f359 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode;
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *compMode;
 	gid = GetObjectMember(oThdConf, "index");
 	blocks = GetObjectMember(oThdConf, "blocks");
 	threads = GetObjectMember(oThdConf, "threads");
@@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	bsleep = GetObjectMember(oThdConf, "bsleep");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	syncMode = GetObjectMember(oThdConf, "sync_mode");
+	compMode = GetObjectMember(oThdConf, "comp_mode");
 
 	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
-		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr)
+		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr ||
+		compMode == nullptr)
 	{
 		return false;
 	}
@@ -161,13 +163,19 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
 		return false;
 	}
+
+	if(!compMode->IsBool())
+		return false;
+
+
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
 	cfg.bfactor = bfactor->GetInt();
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
-
+	cfg.compMode = compMode->GetBool();
+	
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
 	else
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index b4ebaa035..5ee1f8133 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -29,6 +29,7 @@ class jconf
 		bool bNoPrefetch;
 		int32_t cpu_aff;
 		int syncMode;
+		bool compMode;
 
 		long long iCpuAff;
 	};
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 0153eed19..135f26ea9 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -78,6 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.device_bfactor = (int)cfg.bfactor;
 	ctx.device_bsleep = (int)cfg.bsleep;
 	ctx.syncMode = cfg.syncMode;
+	ctx.compMode = cfg.compMode;
 	this->affinity = cfg.cpu_aff;
 
 	std::future<void> numa_guard = numa_promise.get_future();
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index d588641b4..8167395e3 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -16,6 +16,7 @@ typedef struct {
 	int device_bfactor;
 	int device_bsleep;
 	int syncMode;
+	bool compMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 8e69c2029..1c9c9df64 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <bitset>
 
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
@@ -254,8 +255,12 @@ struct u64 : public uint2
 	}
 };
 
-
-template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
+/** cryptonight with two threads per hash
+ *
+ * @tparam COMP_MODE if true than 64bit memory transfers per thread will be used to store/load data within shared memory
+ *                   else 128bit operations will be used
+ */
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, bool COMP_MODE>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 2 )
 #endif
@@ -329,7 +334,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	{
 		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
-		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+		if(COMP_MODE)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
@@ -358,13 +372,31 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		}
 
 		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
-		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+		if(COMP_MODE)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
 		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
-		((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+		if(COMP_MODE)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
 
 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
@@ -420,7 +452,16 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			bx0 = cx_aes;
 		}
 		myChunks[ idx1 + sub ] = ax0;
-		((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+		if(COMP_MODE)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
 		ax0 ^= c;
 		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}
@@ -699,7 +740,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, bool COMP_MODE>
 void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
 	dim3 grid( ctx->device_blocks );
@@ -741,7 +782,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 			CUDA_CHECK_MSG_KERNEL(
 				ctx->device_id,
 				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO><<<
+				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, COMP_MODE><<<
 					grid,
 					block2,
 					sizeof(uint64_t) * block2.x * 8 +
@@ -807,26 +848,50 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 	}
 }
 
-typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
+	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
+	
 	if(miner_algo == invalid_algo) return;
 	
 	static const cuda_hash_fn func_table[] = {
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, true>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, false>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, true>
 	};
 
-	cuda_hash_fn selected_function = func_table[ miner_algo - 1u ];
+	std::bitset<1> digit;
+	digit.set(0, ctx->compMode);
+
+	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
 	selected_function(ctx, startNonce);
 }

From 58b7c66c06519f84328a7553459f99c77446b2f7 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 8 Oct 2018 22:16:49 +0200
Subject: [PATCH 68/77] improve error message

Add a suggestion to a common error line which is shown in the event of a crash under Windows.
---
 xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index f136744d4..433e175dd 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -415,7 +415,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	}
 
 	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK_MSG(
+		ctx->device_id,
+		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
+		cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )
+	);
 
 	/* There is only a 32bit limit for the counter on the device side
 	 * therefore this value can be greater than 10, in that case limit rescount

From b9eed59febf2ce7ed914382119559cd320c0e3a9 Mon Sep 17 00:00:00 2001
From: fireice-uk <fireice-uk@users.noreply.github.com>
Date: Tue, 9 Oct 2018 20:58:33 +0100
Subject: [PATCH 69/77] Add Ryo sponsorship message

---
 README.md                 |  5 ++++-
 xmrstak/cli/cli-miner.cpp | 13 +++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2fe1bc511..046a930e1 100644
--- a/README.md
+++ b/README.md
@@ -46,9 +46,12 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Intense](https://intensecoin.com)
 - [Masari](https://getmasari.org)
 - [QRL](https://theqrl.org)
-- [Ryo](https://ryo-currency.com)
+- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
 - [TurtleCoin](https://turtlecoin.lol)
 
+Ryo currency is a way for us to implement the ideas that we were unable to in
+Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
+
 If your prefered coin is not listed, you can choose one of the following algorithms:
 
 - 1MiB scratchpad memory
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 428952b1b..171e6dec9 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -783,11 +783,24 @@ int main(int argc, char *argv[])
 	char buffer[64];
 	snprintf(buffer, sizeof(buffer), "\nConfigurable dev donation level is set to %.1f%%\n\n", fDevDonationLevel * 100.0);
 	printer::inst()->print_str(buffer);
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_str("You can use following keys to display reports:\n");
 	printer::inst()->print_str("'h' - hashrate\n");
 	printer::inst()->print_str("'r' - results\n");
 	printer::inst()->print_str("'c' - connection\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
+	printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n");
+	printer::inst()->print_str("   #####   ______               _____\n");
+	printer::inst()->print_str(" ##     ## | ___ \\             /  __ \\\n");
+	printer::inst()->print_str("#    _    #| |_/ /_   _   ___  | /  \\/ _   _  _ _  _ _  ___  _ __    ___  _   _\n");
+	printer::inst()->print_str("#   |_|   #|    /| | | | / _ \\ | |    | | | || '_|| '_|/ _ \\| '_ \\  / __|| | | |\n");
+	printer::inst()->print_str("#         #| |\\ \\| |_| || (_) || \\__/\\| |_| || |  | | |  __/| | | || (__ | |_| |\n");
+	printer::inst()->print_str(" ##     ## \\_| \\_|\\__, | \\___/  \\____/ \\__,_||_|  |_|  \\___||_| |_| \\___| \\__, |\n");
+	printer::inst()->print_str("   #####           __/ |                                                   __/ |\n");
+	printer::inst()->print_str("                  |___/    https://ryo-currency.com                       |___/\n\n");
+	printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n");
+	printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n");
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str());
 
 	if(params::inst().benchmark_block_version >= 0)

From ed2168b48d16a9870cbef067d38a5ad16b26c9f9 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 11:52:40 +0200
Subject: [PATCH 70/77] CUDA: fix invalid results

If `comp_mode` is false, the results on a Windows platform will be invalid.
The reason is that `ulong4` is 16 bytes on Windows but 32 bytes on Linux.

thx @xmrig for finding and solving the issue

fix #1873
---
 xmrstak/backend/nvidia/autoAdjust.hpp         | 2 +-
 xmrstak/backend/nvidia/config.tpl             | 2 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 6354f60f0..27783acd1 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -96,7 +96,7 @@ class autoAdjust
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
-					"    \"comp_mode\" : true,\n" +
+					"    \"comp_mode\" : false,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index e2a76d90f..8803f6ff2 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -17,7 +17,7 @@ R"===(// generated by XMRSTAK_VERSION
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
  * comp_mode     - Compatibility if true it will use 64bit memory loads and if false it will use
- *                               128bit memory loads (can produce invalid results)
+ *                               256bit memory loads (can produce invalid results)
  *                               (this option has only a meaning for cryptonight_v8 and monero)
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 1c9c9df64..3dce3e4ac 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -343,7 +343,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
@@ -381,7 +381,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
@@ -396,7 +396,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)myChunks)[sub] = ((ulong4*)ptr0)[sub];
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
 		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
@@ -461,7 +461,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 		}
 		else
-			((ulong4*)ptr0)[sub] = ((ulong4*)myChunks)[sub];
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
 		ax0 ^= c;
 		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}

From b4387ac00dd6eec6ee1bef4736f02e646fa51428 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 12:04:30 +0200
Subject: [PATCH 71/77] fix right bitshift in `amd_bitalign`

In the current implementation the bit align uses signed integers, which results in pulling in
ones when the sign bit is set.

- cast to unsigned integer before using bitshift
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 7689b5d3a..9c9bcd08e 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -35,8 +35,8 @@ R"===(
 inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
 {
 	uint2 result;
-	result.s0 =  (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2));
-	result.s1 =  (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2));
+	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
 	return result;
 }
 #endif

From bd4a4c94290f23bb38a4163baa3582c99eb84513 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 17:35:45 +0200
Subject: [PATCH 72/77] NVIDIA: rename config option `comp_mode`

The name `comp_mode` is a badly chosen name for a memory load pattern.
Therefore I changed it to `mem_mode`, which also gives us the possibility
to add new modes later if needed.

- rename `comp_mode` to `mem_mode`
- fix documentation
---
 xmrstak/backend/nvidia/autoAdjust.hpp         |  2 +-
 xmrstak/backend/nvidia/config.tpl             |  8 +--
 xmrstak/backend/nvidia/jconf.cpp              | 15 +++--
 xmrstak/backend/nvidia/jconf.hpp              |  2 +-
 xmrstak/backend/nvidia/minethd.cpp            |  2 +-
 .../backend/nvidia/nvcc_code/cryptonight.hpp  |  2 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 64 +++++++++----------
 7 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 27783acd1..2755e03d2 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -96,7 +96,7 @@ class autoAdjust
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
-					"    \"comp_mode\" : false,\n" +
+					"    \"mem_mode\" : 1,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 8803f6ff2..8a5982b57 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -16,9 +16,9 @@ R"===(// generated by XMRSTAK_VERSION
  *                 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
- * comp_mode     - Compatibility if true it will use 64bit memory loads and if false it will use
- *                               256bit memory loads (can produce invalid results)
- *                               (this option has only a meaning for cryptonight_v8 and monero)
+ * mem_mode      - select the memory access pattern (this option has only a meaning for cryptonight_v8 and monero)
+ *                 0 = 64bit memory loads
+ *                 1 = 256bit memory loads   
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
@@ -27,7 +27,7 @@ R"===(// generated by XMRSTAK_VERSION
  * "gpu_threads_conf" :
  * [
  *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
- *       "affine_to_cpu" : false, "sync_mode" : 3,
+ *       "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1
  *     },
  * ],
  * If you do not wish to mine with your nVidia GPU(s) then use:
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index b1059f359..6c443343b 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *compMode;
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *memMode;
 	gid = GetObjectMember(oThdConf, "index");
 	blocks = GetObjectMember(oThdConf, "blocks");
 	threads = GetObjectMember(oThdConf, "threads");
@@ -131,11 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	bsleep = GetObjectMember(oThdConf, "bsleep");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	syncMode = GetObjectMember(oThdConf, "sync_mode");
-	compMode = GetObjectMember(oThdConf, "comp_mode");
+	memMode = GetObjectMember(oThdConf, "mem_mode");
 
 	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
 		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr ||
-		compMode == nullptr)
+		memMode == nullptr)
 	{
 		return false;
 	}
@@ -160,12 +160,15 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 
 	if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3)
 	{
-		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
+		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or not a number. ( range: 0 <= sync_mode < 4.)");
 		return false;
 	}
 
-	if(!compMode->IsBool())
+	if(!memMode->IsNumber() || memMode->GetInt() < 0 || memMode->GetInt() > 1)
+	{
+		printer::inst()->print_msg(L0, "Error NVIDIA: mem_mode out of range or not a number. (range: 0 or 1)");
 		return false;
+	}
 
 
 	cfg.id = gid->GetInt();
@@ -174,7 +177,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	cfg.bfactor = bfactor->GetInt();
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
-	cfg.compMode = compMode->GetBool();
+	cfg.memMode = memMode->GetInt();
 	
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index 5ee1f8133..40b72f880 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -29,7 +29,7 @@ class jconf
 		bool bNoPrefetch;
 		int32_t cpu_aff;
 		int syncMode;
-		bool compMode;
+		int memMode;
 
 		long long iCpuAff;
 	};
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 135f26ea9..e82ec91c3 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -78,7 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.device_bfactor = (int)cfg.bfactor;
 	ctx.device_bsleep = (int)cfg.bsleep;
 	ctx.syncMode = cfg.syncMode;
-	ctx.compMode = cfg.compMode;
+	ctx.memMode = cfg.memMode;
 	this->affinity = cfg.cpu_aff;
 
 	std::future<void> numa_guard = numa_promise.get_future();
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 8167395e3..8fda8d401 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -16,7 +16,7 @@ typedef struct {
 	int device_bfactor;
 	int device_bsleep;
 	int syncMode;
-	bool compMode;
+	bool memMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 3dce3e4ac..00a65332a 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -257,10 +257,10 @@ struct u64 : public uint2
 
 /** cryptonight with two threads per hash
  *
- * @tparam COMP_MODE if true than 64bit memory transfers per thread will be used to store/load data within shared memory
- *                   else 128bit operations will be used
+ * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory
+ *                   else if `1` 256bit operations will be used
  */
-template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, bool COMP_MODE>
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, uint32_t MEM_MODE>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 2 )
 #endif
@@ -334,7 +334,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	{
 		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
-		if(COMP_MODE)
+		if(MEM_MODE == 0)
 		{
 			#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
@@ -372,7 +372,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		}
 
 		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
-		if(COMP_MODE)
+		if(MEM_MODE == 0)
 		{
 			#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
@@ -387,7 +387,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		idx1 = (idx0 & 0x30) >> 3;
 		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
 
-		if(COMP_MODE)
+		if(MEM_MODE == 0)
 		{
 			#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
@@ -452,7 +452,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			bx0 = cx_aes;
 		}
 		myChunks[ idx1 + sub ] = ax0;
-		if(COMP_MODE)
+		if(MEM_MODE == 0)
 		{
 			#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
@@ -740,7 +740,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, bool COMP_MODE>
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
 	dim3 grid( ctx->device_blocks );
@@ -782,7 +782,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 			CUDA_CHECK_MSG_KERNEL(
 				ctx->device_id,
 				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, COMP_MODE><<<
+				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, MEM_MODE><<<
 					grid,
 					block2,
 					sizeof(uint64_t) * block2.x * 8 +
@@ -855,42 +855,42 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	if(miner_algo == invalid_algo) return;
 	
 	static const cuda_hash_fn func_table[] = {
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, true>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, false>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, true>
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>
 	};
 
 	std::bitset<1> digit;
-	digit.set(0, ctx->compMode);
+	digit.set(0, ctx->memMode == 1);
 
 	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
 	selected_function(ctx, startNonce);

From 5a80f50a629ff487cd557384f39082af56f79532 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 17:41:39 +0200
Subject: [PATCH 73/77] update version to 2.5.0

---
 xmrstak/version.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index cd8bb52f5..80c25c6b8 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.4.7"
+#define XMR_STAK_VERSION "2.5.0"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"

From b1504b36e756269fc47cbf9ad9a959ce3d9ccba7 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Wed, 10 Oct 2018 20:51:59 +0200
Subject: [PATCH 74/77] NVIDIA: tweak `get_reciprocal`

- remove helper array to perform division
- tweak `get_reciprocal`
---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 13 +--
 .../nvcc_code/cuda_fast_int_math_v2.hpp       | 82 ++++---------------
 2 files changed, 19 insertions(+), 76 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 00a65332a..7742e740e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -271,17 +271,6 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 
 	cn_aes_gpu_init( sharedMemory );
 
-	uint32_t* RCP;
-	if(ALGO == cryptonight_monero_v8)
-	{
-		__shared__ uint32_t RCP_shared[256];
-		for (int i = threadIdx.x; i < 256; i += blockDim.x)
-		{
-			RCP_shared[i] = RCP_C[i];
-		}
-		RCP = RCP_shared;
-	}
-
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
 	// 8 x 64bit values
@@ -413,7 +402,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
 
 			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
-			division_result = fast_div_v2(RCP, cx_aes, dd);
+			division_result = fast_div_v2(cx_aes, dd);
 
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index e3220230a..796b7adda 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -2,80 +2,35 @@
 
 #include <stdint.h>
 
-static __constant__ const uint32_t RCP_C[256] =
+__device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 {
-	0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
-	0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
-	0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
-	0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
-	0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
-	0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
-	0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
-	0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
-	0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
-	0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
-	0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
-	0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
-	0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
-	0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
-	0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
-	0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
-	0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
-	0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
-	0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
-	0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
-	0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
-	0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
-	0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
-	0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
-	0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
-	0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
-	0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
-	0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
-	0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
-	0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
-	0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
-	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
-};
+	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
+	const float a_lo = __uint2float_rn(a & 0xFF);
 
-__device__ __forceinline__ uint32_t get_reciprocal(const uint32_t* RCP, uint32_t a)
-{
-	const uint32_t index1 = (a & 0x7F000000U) >> 23;
-	const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768;
-
-	const uint32_t r1 = RCP[index1];
-	uint32_t r2_0 = RCP[index1 + 1];
-	if (index2 > 0) r2_0 >>= 16;
-	const int r2 = r2_0 & 0xFFFFU;
-
-	const uint32_t r = r1 - (uint32_t)(__mul24(r2, index2) >> 6);
-
-	const uint64_t lo0 = (uint64_t)(r) * a;
-	uint64_t lo = lo0 + ((uint64_t)(a) << 32);
+	float r;
+	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
 
-	a >>= 1;
-	const bool b = (a >= lo) || (lo >= lo0);
-	lo = a - lo;
-
-	const uint64_t k = __umulhi((uint32_t)lo, r) + ((uint64_t)(r) * ((uint32_t*)&lo)[1]) + lo;
-	return ((uint32_t*)&k)[1] + (b ? r : 0);
+	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
+	return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled);
 }
 
-__device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a, uint32_t b)
+__device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
-	const uint32_t r = get_reciprocal(RCP, b);
-	const uint64_t k = __umulhi((uint32_t)a, r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+	const uint32_t r = get_reciprocal(b);
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
 
 	uint32_t q[2];
 	q[0] = ((uint32_t*)&k)[1];
-	q[1] = (k < a) ? 1 : 0;
 
-	const int64_t tmp = a - *((uint64_t*)(q)) * b;
-	const uint32_t overshoot = (tmp < 0) ? 1u : 0U;
-	const uint32_t undershoot = (tmp >= b) ? 1u : 0U;
+	int64_t tmp = a - (uint64_t)(q[0]) * b;
+	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+
+	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
+	const bool undershoot = tmp >= b;
 
-	q[0] += undershoot - overshoot;
-	q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? b : 0U);
+	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
+	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
 
 	return *((uint64_t*)(q));
 }
@@ -102,6 +57,5 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0;
 	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0;
 	result += (overshoot+undershoot);
-
 	return result;
 }

From bf882d44a4a341d7dcda7717f095a10f8a954fea Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 10 Oct 2018 21:38:15 +0200
Subject: [PATCH 75/77] update documentation

- update tuning guide
- update miner usage
---
 doc/tuning.md | 39 +++++++++++++++++++++++++++++++++------
 doc/usage.md  | 28 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/doc/tuning.md b/doc/tuning.md
index 6bf036e9f..2673d68d9 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -9,6 +9,7 @@
 * [AMD Backend](#amd-backend)
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
+  * [Two Threads per GPU](#two-threads-per-gpu)
   * [disable comp_mode](#disable-comp_mode)
   * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
@@ -55,10 +56,10 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`.
 "gpu_threads_conf" :
 [
     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
     },
     { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
     },
 ],
 ```
@@ -82,11 +83,37 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
 ```
 "gpu_threads_conf" :
 [
-    { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+    {
+      "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
     },
-    { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+    {
+      "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+],
+
+"platform_index" : 0,
+```
+
+### Two Threads per GPU
+
+Some GPUs like AMD Vega can mine faster if two threads are using the same GPU.
+Use the auto-generated config as a base and repeat the config entry for a GPU.
+If the same `index` value is used twice, then two threads will use one GPU.
+Take care that the required memory usage on the GPU will also double.
+Therefore adjust your intensity by hand.
+
+```
+"gpu_threads_conf" :
+[
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
     },
 ],
 
diff --git a/doc/usage.md b/doc/usage.md
index 886c1b319..a371f0e67 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,6 +5,7 @@
 * [Usage on Windows](#usage-on-windows)
 * [Usage on Linux](#usage-on-linux)
 * [Command Line Options](#command-line-options)
+* [Use different backends](#use-different-backends)
 * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
 
 ## Configurations
@@ -34,6 +35,33 @@ Note: If the pool is ignoring the option `rig_id` in `pools.txt` to name your wo
 The miner allow to overwrite some of the settings via command line options.
 Run `xmr-stak --help` to show all available command line options.
 
+## Use Different Backends
+
+On linux and OSX please add `./` before the binary name `xmr-stak`.
+
+### CPU Only:
+```
+xmr-stak --noAMD --noNVIDIA
+```
+
+### NVIDIA/AMD Only:
+
+The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available.
+
+```
+xmr-stak --noCPU
+```
+**CUDA** is currently not supported. I am currently trying to get some performance out of it.
+
+### NVIDIA via OpenCL
+
+It is possible to use the OpenCL backend, which was originally created for AMD GPUs, with NVIDIA GPUs.
+Some NVIDIA GPUs can reach better performance with this backend.
+
+```
+xmr-stak --openCLVendor NVIDIA --noNVIDIA
+```
+
 ## Docker image usage
 
 You can run the Docker image the following way:

From 074a9d208b87e2b4a1205c8bb7e04260e14d81b1 Mon Sep 17 00:00:00 2001
From: fireice-uk <fireice-uk@users.noreply.github.com>
Date: Thu, 11 Oct 2018 15:16:35 +0100
Subject: [PATCH 76/77] fix logo on Windows

---
 xmrstak/cli/cli-miner.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 171e6dec9..40fb9d948 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -790,14 +790,14 @@ int main(int argc, char *argv[])
 	printer::inst()->print_str("'c' - connection\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n");
-	printer::inst()->print_str("   #####   ______               _____\n");
-	printer::inst()->print_str(" ##     ## | ___ \\             /  __ \\\n");
-	printer::inst()->print_str("#    _    #| |_/ /_   _   ___  | /  \\/ _   _  _ _  _ _  ___  _ __    ___  _   _\n");
-	printer::inst()->print_str("#   |_|   #|    /| | | | / _ \\ | |    | | | || '_|| '_|/ _ \\| '_ \\  / __|| | | |\n");
-	printer::inst()->print_str("#         #| |\\ \\| |_| || (_) || \\__/\\| |_| || |  | | |  __/| | | || (__ | |_| |\n");
-	printer::inst()->print_str(" ##     ## \\_| \\_|\\__, | \\___/  \\____/ \\__,_||_|  |_|  \\___||_| |_| \\___| \\__, |\n");
-	printer::inst()->print_str("   #####           __/ |                                                   __/ |\n");
-	printer::inst()->print_str("                  |___/    https://ryo-currency.com                       |___/\n\n");
+	printer::inst()->print_str("   #####   ______               ____\n");
+	printer::inst()->print_str(" ##     ## | ___ \\             /  _ \\\n");
+	printer::inst()->print_str("#    _    #| |_/ /_   _   ___  | / \\/ _   _  _ _  _ _  ___  _ __    ___  _   _\n");
+	printer::inst()->print_str("#   |_|   #|    /| | | | / _ \\ | |   | | | || '_|| '_|/ _ \\| '_ \\  / __|| | | |\n");
+	printer::inst()->print_str("#         #| |\\ \\| |_| || (_) || \\_/\\| |_| || |  | | |  __/| | | || (__ | |_| |\n");
+	printer::inst()->print_str(" ##     ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_|  |_|  \\___||_| |_| \\___| \\__, |\n");
+	printer::inst()->print_str("   #####           __/ |                                                  __/ |\n");
+	printer::inst()->print_str("                  |___/   https://ryo-currency.com                       |___/\n\n");
 	printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n");
 	printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");

From 732b0e4115cf882d5c17479d36b9b37fa8fcdce1 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Thu, 11 Oct 2018 10:30:51 +0200
Subject: [PATCH 77/77] NVIDIA: support for multiple CUDA libs

Allow shipping the miner with multiple CUDA backends that depend on different driver versions.
This makes it possible to support Turing/Volta and old Fermi GPUs within one release.

- add support to search for the first working CUDA backend
- add some more messages to support better debugging (if a user has some issues)
---
 xmrstak/backend/backendConnector.cpp | 46 +++++++++++++++++++++++-----
 xmrstak/backend/nvidia/minethd.cpp   |  4 +++
 xmrstak/backend/plugin.hpp           | 31 +++++++++++--------
 3 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 525413fd5..92bb01506 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -63,10 +63,35 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 #ifndef CONF_NO_CUDA
 	if(params::inst().useNVIDIA)
 	{
-		plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend");
-		std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
-		if(nvidiaThreads->size() == 0)
+		plugin nvidiaplugin;
+		std::vector<iBackend*>* nvidiaThreads;
+		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
+		size_t numWorkers = 0u;
+
+		for( const auto & name : libNames)
+		{
+			printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
+			nvidiaplugin.load("NVIDIA", name);
+			std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
+			if(nvidiaThreads != nullptr)
+			{
+				pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
+				numWorkers = nvidiaThreads->size();
+				delete nvidiaThreads;
+			}
+			else
+			{
+				// remove the plugin if we have found no GPUs
+				nvidiaplugin.unload();
+			}
+			// we found at least one working GPU
+			if(numWorkers != 0)
+			{
+				printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
+				break;
+			}
+		}
+		if(numWorkers == 0)
 			printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled.");
 	}
 #endif
@@ -75,10 +100,17 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 	if(params::inst().useAMD)
 	{
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		plugin amdplugin(backendName, "xmrstak_opencl_backend");
+		plugin amdplugin;
+		amdplugin.load(backendName, "xmrstak_opencl_backend");
 		std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
-		if(amdThreads->size() == 0)
+		size_t numWorkers = 0u;
+		if(amdThreads != nullptr)
+		{
+			pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
+			numWorkers = amdThreads->size();
+			delete amdThreads;
+		}
+		if(numWorkers == 0)
 			printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
 	}
 #endif
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index e82ec91c3..6460628de 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -165,6 +165,10 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 		std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
 		return pvThreads;
 	}
+	else
+	{
+		std::cout<<"NVIDIA: found "<< deviceCount <<" potential device's"<<std::endl;
+	}
 
 	size_t i, n = jconf::inst()->GetGPUThreadCount();
 	pvThreads->reserve(n);
diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp
index 1811af224..5c7dfe16a 100644
--- a/xmrstak/backend/plugin.hpp
+++ b/xmrstak/backend/plugin.hpp
@@ -27,8 +27,11 @@ namespace xmrstak
 struct plugin
 {
 
-	plugin(const std::string backendName, const std::string libName) : fn_startBackend(nullptr), m_backendName(backendName)
+	plugin() = default;
+
+	void load(const std::string backendName, const std::string libName)
 	{
+		m_backendName = backendName;
 #ifdef WIN32
 		libBackend = LoadLibrary(TEXT((libName + ".dll").c_str()));
 		if(!libBackend)
@@ -81,32 +84,36 @@ struct plugin
 		if(fn_startBackend == nullptr)
 		{
 			std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
-			std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl;
 			return pvThreads;
 		}
 
 		return fn_startBackend(threadOffset, pWork, env);
 	}
 
+	void unload()
+	{
+		if(libBackend)
+		{
+#ifdef WIN32
+			FreeLibrary(libBackend);
+#else
+			dlclose(libBackend);
+#endif
+		}
+		fn_startBackend = nullptr;
+	}
+
 	std::string m_backendName;
 
 	typedef std::vector<iBackend*>* (*startBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env);
 
-	startBackend_t fn_startBackend;
+	startBackend_t fn_startBackend = nullptr;
 
 #ifdef WIN32
 	HINSTANCE libBackend;
 #else
-	void *libBackend;
-#endif
-
-/* \todo add unload to destructor and change usage of plugin that libs kept open until the miner ends
-#ifdef WIN32
-	FreeLibrary(libBackend);
-#else
-	dlclose(libBackend);
+	void *libBackend = nullptr;
 #endif
- * */
 };
 
 } // namespace xmrstak