From c5e0ab5e7d39f13a4c285726000b08f4618b1dcf Mon Sep 17 00:00:00 2001
From: Jacco Bikker <bikker.j@gmail.com>
Date: Wed, 4 Nov 2020 20:25:35 +0100
Subject: [PATCH] Merged experimental lighttree code.

---
 apps/benchmarkapp/camera.xml                  |   6 +-
 lib/CUDA/shared_kernel_code/lights_shared.h   | 163 ++++++++++++++++++
 .../kernels/.cuda.cu                          |   5 +-
 lib/RenderCore_Optix7Filter/kernels/.cuda.cu  |   5 +-
 lib/RenderCore_Optix7Guiding/kernels/.cuda.cu |   5 +-
 .../kernels/pathtracer.h                      |   8 +-
 lib/RenderCore_OptixPrime_B/kernels/.cuda.cu  |   5 +-
 .../kernels/pathtracer.h                      |   2 +-
 .../kernels/.cuda.cu                          |   2 +
 .../kernels/.cuda.cu                          |   2 +
 lib/RenderCore_PrimeAdaptive/kernels/.cuda.cu |   5 +-
 lib/rendercore_optix7/core_settings.h         |   1 +
 lib/rendercore_optix7/kernels/.cuda.cu        |   5 +-
 lib/rendercore_optix7/kernels/pathtracer.h    |   2 +
 lib/rendercore_optix7/rendercore.cpp          | 116 +++++++++++++
 lib/rendercore_optix7/rendercore.h            |   4 +
 16 files changed, 322 insertions(+), 14 deletions(-)
diff --git a/apps/benchmarkapp/camera.xml b/apps/benchmarkapp/camera.xml
index 37a06cf95..967a04b38 100644
--- a/apps/benchmarkapp/camera.xml
+++ b/apps/benchmarkapp/camera.xml
@@ -1,12 +1,12 @@
 <camera>
-    <transform m00="0.98938847" m01="-0.022192042" m02="-0.14359" m03="-2.8366079" m10="0" m11="0.98826677" m12="-0.15273803" m13="0.45841199" m20="-0.1452948" m21="-0.15111725" m22="-0.97777963" m23="30.572617" m30="0" m31="0" m32="0" m33="1"/>
+    <transform m00="-0.58767647" m01="-0.25613666" m02="-0.76748312" m03="9.9049749" m10="0" m11="0.9485687" m12="-0.31657138" m13="10.969443" m20="-0.80909604" m21="0.18604155" m22="0.55745149" m23="-17.508549" m30="0" m31="0" m32="0" m33="1"/>
     <FOV>40</FOV>
     <brightness>0</brightness>
     <contrast>0</contrast>
     <gamma>2.2</gamma>
-    <aperture>0.0089872945</aperture>
+    <aperture>0.0089999996</aperture>
     <distortion>0.050000001</distortion>
-    <focalDistance>5.0255618</focalDistance>
+    <focalDistance>21.969528</focalDistance>
     <clampValue>2.5</clampValue>
     <tonemapper>3</tonemapper>
 </camera>
diff --git a/lib/CUDA/shared_kernel_code/lights_shared.h b/lib/CUDA/shared_kernel_code/lights_shared.h
index 9c87793b3..ec551a033 100644
--- a/lib/CUDA/shared_kernel_code/lights_shared.h
+++ b/lib/CUDA/shared_kernel_code/lights_shared.h
@@ -22,6 +22,7 @@
 #include "noerrors.h"
 
 #define ISLIGHTS
+// #define LIGHTTREE
 #define MAXISLIGHTS	64
 
 #define TRILIGHTCOUNT			(lightCounts.x & 0xffff)
@@ -112,6 +113,59 @@ LH2_DEVFUNC float CalculateLightPDF( const float3& D, const float t, const float
 	return (t * t) / (abs( dot( D, lightNormal ) ) * lightArea);
 }
 
+//  +-----------------------------------------------------------------------------+
+//  |  CalculateChildNodeWeights                                                  |
+//  |  Helper to compute child node weights.                                LH2'20|
+//  +-----------------------------------------------------------------------------+
+LH2_DEVFUNC float CalculateChildNodeWeights( const int node, const float3& I, const float3& N, uint& seed, const bool debug = false )
+{
+	const int left = lightTree[node].left;
+	const int right = lightTree[node].right;
+	const float3 b1j = make_float3( lightTree[left].bmin ), b1k = make_float3( lightTree[right].bmin );
+	const float3 b2j = make_float3( lightTree[left].bmax ), b2k = make_float3( lightTree[right].bmax );
+	const float3 diag_j = b2j - b1j;
+	const float3 diag_k = b2k - b1k;
+	const float3 LN = make_float3( lightTree[node].N );
+	// calculate (squared) minimum and maximum distance from I to aabb
+	// method: GPU-Accelerated Minimum Distance and Clearance Queries, Krishnamurthy et al., 2011
+	const float3 Bj = 0.5f * diag_j;
+	const float3 Bk = 0.5f * diag_k;
+	const float3 Cj = (b1j + b2j) * 0.5f;
+	const float3 Ck = (b1k + b2k) * 0.5f;
+	const float3 Dj = Cj - I;
+	const float3 Dk = Ck - I;
+	const float3 min_j = make_float3( max( Dj.x - Bj.x, 0.0f ), max( Dj.y - Bj.y, 0.0f ), max( Dj.z - Bj.z, 0.0f ) );
+	const float3 min_k = make_float3( max( Dk.x - Bk.x, 0.0f ), max( Dk.y - Bk.y, 0.0f ), max( Dk.z - Bk.z, 0.0f ) );
+	const float dist2j = dot( min_j, min_j );
+	const float dist2k = dot( min_k, min_k );
+	const float3 max_j = Dj + Bj;
+	const float3 max_k = Dk + Bk;
+	const float dist2j_max = dot( max_j, max_j );
+	const float dist2k_max = dot( max_k, max_k );
+	// get the left and right cluster intensities
+	const float Ij = lightTree[left].intensity;
+	const float Ik = lightTree[right].intensity;
+	// get a reasonable value for F using the normals at I and the light
+	const float3 Rj = b1j + (b2j - b1j) * make_float3( RandomFloat( seed ), RandomFloat( seed ), RandomFloat( seed ) );
+	const float3 Rk = b1k + (b2k - b1k) * make_float3( RandomFloat( seed ), RandomFloat( seed ), RandomFloat( seed ) );
+	const float3 Lj = normalize( Rj - I );
+	const float3 Lk = normalize( Rk - I );
+	float Fj = max( 0.001f, dot( N, Lj ) );
+	float Fk = max( 0.001f, dot( N, Lk ) );
+	if (dot( LN, LN ) > 0.001f)
+		Fj *= max( 0.001f, dot( LN, Lj * -1.0f ) ),
+		Fk *= max( 0.001f, dot( LN, Lk * -1.0f ) );
+	// calculate final probabilities according to the realtime stochastic lightcuts paper
+	const bool insideBoth = dist2j == 0 && dist2k == 0;
+	const float wmin_j = (Fj * Ij) / (insideBoth ? 1 : max( 0.0001f, dist2j) );
+	const float wmin_k = (Fk * Ik) / (insideBoth ? 1 : max( 0.0001f, dist2k) );
+	const float wmax_j = (Fj * Ij) / max( 0.0001f, dist2j_max );
+	const float wmax_k = (Fj * Ij) / max( 0.0001f, dist2k_max );
+	const float pmin_j = wmin_j / (wmin_j + wmin_k);
+	const float pmax_j = wmax_j / (wmax_j + wmax_k);
+	return 0.5f * (pmin_j + pmax_j);
+}
+
 //  +-----------------------------------------------------------------------------+
 //  |  LightPickProb                                                              |
 //  |  Calculates the probability with which the specified light woukd be picked  |
@@ -135,6 +189,33 @@ LH2_DEVFUNC float LightPickProb( int idx, const float3& O, const float3& N, cons
 #endif
 }
 
+//  +-----------------------------------------------------------------------------+
+//  |  LightPickProbLTree                                                         |
+//  |  Calculates the probability with which the specified light woukd be picked  |
+//  |  from the specified world space location and normal using the stochastic    |
+//  |  lightcuts approach.                                                  LH2'20|
+//  +-----------------------------------------------------------------------------+
+LH2_DEVFUNC float LightPickProbLTree( int idx, const float3& O, const float3& N, const float3& I, uint& seed )
+{
+#ifndef LIGHTTREE
+	return LightPickProb( idx, O, N, I );
+#else
+	LightCluster* tree = lightTree;
+	int node = idx + 1; // leaf for light i is at index i + 1, see UpdateLightTree in rendercore.cpp.
+	float pickProb = 1;
+	while (1)
+	{
+		if (node == 0) break; // we are the root node
+		// determine probability of selecting the current node over its sibling
+		int parent = __float_as_int( tree[node].N.w /* we abused N.w to store the parent node index */ );
+		const float p = CalculateChildNodeWeights( parent, I, N, seed );
+		if (tree[parent].left == node) pickProb *= p /* we are the left child */; else pickProb *= 1 - p;
+		node = parent;
+	}
+	return pickProb;
+#endif
+}
+
 //  +-----------------------------------------------------------------------------+
 //  |  RandomPointOnLight                                                         |
 //  |  Selects a random point on a random light. Returns a position, a normal on  |
@@ -231,6 +312,88 @@ LH2_DEVFUNC float3 RandomPointOnLight( float r0, float r1, const float3& I, cons
 	}
 }
 
+//  +-----------------------------------------------------------------------------+
+//  |  RandomPointOnLightLTree                                                    |
+//  |  Selects a random point on a random light, using the stochastic lightcuts   |
+//  |  approach, via a binary light tree. Default method for the Optix7 core.     |
+//  |  Returns a position, a normal on the light source, the pick probability,    |
+//  |  and the importance of the explicit connection.                       LH2'20|
+//  +-----------------------------------------------------------------------------+
+LH2_DEVFUNC float3 RandomPointOnLightLTree( float r0, float r1, uint& seed, const float3& I, const float3& N, float& pickProb, float& lightPdf, float3& lightColor, const bool debug = false )
+{
+#ifndef LIGHTTREE
+	return RandomPointOnLight( r0, r1, I, N, pickProb, lightPdf, lightColor );
+#else
+	LightCluster* tree = lightTree;
+	int node = 0; // index of root node
+	int lightIdx = 0;
+	pickProb = 1;
+	while (1)
+	{
+		if (tree[node].left == -1)
+		{
+			// reached a leaf, use this light
+			lightIdx = tree[node].light;
+			break;
+		}
+		// interior node; randomly pick a child
+		const float p_j = CalculateChildNodeWeights( node, I, N, seed, debug );
+		// decide
+		if (r1 < p_j)
+			node = tree[node].left, r1 *= 1.0f / p_j, pickProb *= p_j;
+		else
+			node = tree[node].right, r1 = (r1 - p_j) / (1 - p_j), pickProb *= 1 - p_j;
+	}
+	if (lightIdx & (1 << 30))
+	{
+		// pick a pointlight
+		const CorePointLight4& light = (const CorePointLight4&)pointLights[lightIdx - (1 << 30)];
+		const float3 P = make_float3( light.data0 );	// position
+		const float3 L = P - I;
+		const float sqDist = dot( L, L );
+		lightColor = make_float3( light.data1 ) / sqDist;		// radiance
+		lightPdf = dot( L, N ) > 0 ? 1 : 0;
+		return P;
+	}
+	else if (lightIdx & (1 << 29))
+	{
+		// spotlight
+		const CoreSpotLight4& light = (const CoreSpotLight4&)spotLights[lightIdx - (1 << 29)];
+		const float4 V0 = light.data0;				// position + cos_inner
+		const float4 V1 = light.data1;				// radiance + cos_outer
+		const float4 D = light.data2;				// direction
+		const float3 P = make_float3( V0 );
+		float3 L = I - P;
+		const float sqDist = dot( L, L );
+		L = normalize( L );
+		float d = (max( 0.0f, L.x * D.x + L.y * D.y + L.z * D.z ) - V1.w) / (V0.w - V1.w);
+		const float LNdotL = min( 1.0f, d );
+		lightPdf = (LNdotL > 0 && dot( L, N ) < 0) ? (sqDist / LNdotL) : 0;
+		lightColor = make_float3( V1 );
+		return P;
+	}
+	else
+	{
+		// light triangle
+		float3 bary = RandomBarycentrics( r0 );
+		const CoreLightTri4& light = (const CoreLightTri4&)triLights[lightIdx];
+		const float4 V0 = light.data3;				// vertex0
+		const float4 V1 = light.data4;				// vertex1
+		const float4 V2 = light.data5;				// vertex2
+		lightColor = make_float3( light.data2 );	// radiance
+		const float4 LN = light.data1;				// N
+		const float3 P = make_float3( bary.x * V0 + bary.y * V1 + bary.z * V2 );
+		float3 L = I - P;							// reversed: from light to intersection point
+		const float sqDist = dot( L, L );
+		L = normalize( L );
+		const float LNdotL = L.x * LN.x + L.y * LN.y + L.z * LN.z;
+		const float reciSolidAngle = sqDist / (LN.w * LNdotL); // LN.w contains area
+		lightPdf = (LNdotL > 0 && dot( L, N ) < 0) ? reciSolidAngle : 0;
+		return P;
+	}
+#endif
+}
+
 //  +-----------------------------------------------------------------------------+
 //  |  Sample_Le                                                                  |
 //  |  Part of the BDPT core.                                               LH2'19|
diff --git a/lib/RenderCore_Optix7Adaptive/kernels/.cuda.cu b/lib/RenderCore_Optix7Adaptive/kernels/.cuda.cu
index 2cb154d23..5f416b592 100644
--- a/lib/RenderCore_Optix7Adaptive/kernels/.cuda.cu
+++ b/lib/RenderCore_Optix7Adaptive/kernels/.cuda.cu
@@ -34,6 +34,7 @@ __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ PathState* pathStates;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -44,7 +45,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -74,6 +75,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
 {
@@ -151,6 +153,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_Optix7Filter/kernels/.cuda.cu b/lib/RenderCore_Optix7Filter/kernels/.cuda.cu
index 65ae74a6a..0a054c690 100644
--- a/lib/RenderCore_Optix7Filter/kernels/.cuda.cu
+++ b/lib/RenderCore_Optix7Filter/kernels/.cuda.cu
@@ -34,6 +34,7 @@ __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ PathState* pathStates;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -44,7 +45,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -74,6 +75,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
 {
@@ -117,6 +119,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_Optix7Guiding/kernels/.cuda.cu b/lib/RenderCore_Optix7Guiding/kernels/.cuda.cu
index d1de27803..fafe533bf 100644
--- a/lib/RenderCore_Optix7Guiding/kernels/.cuda.cu
+++ b/lib/RenderCore_Optix7Guiding/kernels/.cuda.cu
@@ -36,6 +36,7 @@ __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ PathState* pathStates;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -46,7 +47,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, GUIDANCE, PMIN, PEXT };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, GUIDANCE, PMIN, PEXT, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -78,6 +79,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 	if (id == GUIDANCE) cudaMemcpyToSymbol( guidance, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
@@ -170,6 +172,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_Optix7Guiding/kernels/pathtracer.h b/lib/RenderCore_Optix7Guiding/kernels/pathtracer.h
index 26cba752b..918124242 100644
--- a/lib/RenderCore_Optix7Guiding/kernels/pathtracer.h
+++ b/lib/RenderCore_Optix7Guiding/kernels/pathtracer.h
@@ -65,7 +65,7 @@ LH2_DEVFUNC float3 RandomPointOnLightPNEE( float r0, float r1, const float3& I,
 	if (g1.x == 0)
 	{
 		// nothing here; sample lights uniformly
-		lightIdx = (int)(r1 * (TRILIGHTCOUNT + POINTLIGHTCOUNT));
+		lightIdx = min( (int)(r1 * (TRILIGHTCOUNT + POINTLIGHTCOUNT)), TRILIGHTCOUNT + POINTLIGHTCOUNT - 1 );
 		pickProb = 1.0f / (TRILIGHTCOUNT + POINTLIGHTCOUNT);
 	}
 	else
@@ -374,9 +374,9 @@ void shadeKernel( float4* accumulator, const uint stride,
 	// next event estimation: connect eye path to light
 	if ((FLAGS & S_SPECULAR) == 0 && connections != 0) // skip for specular vertices
 	{
-		float pickProb, lightPdf = 0;
+		float pickProb, lightPdf = (pixelIdx == probePixelIdx && pathLength == 1) ? -999 : 0;
 		float3 lightColor, L;
-		if (pixelIdx % SCRWIDTH < SCRWIDTH / 2)
+		if (1) // pixelIdx % SCRWIDTH < SCRWIDTH / 2)
 		{
 			float3 jitter = make_float3( 1.0f / PNEErext.x, 1.0f / PNEErext.y, 1.0f / PNEErext.z );
 			float3 II = make_float3(
@@ -410,7 +410,7 @@ void shadeKernel( float4* accumulator, const uint stride,
 				const uint shadowRayIdx = atomicAdd( &counters->shadowRays, 1 ); // compaction
 				connections[shadowRayIdx] = make_float4( SafeOrigin( I, L, N, geometryEpsilon ), 0 ); // O4
 				connections[shadowRayIdx + stride * 2] = make_float4( L, dist - 2 * geometryEpsilon ); // D4
-				connections[shadowRayIdx + stride * 2 * 2] = make_float4( contribution, __int_as_float( pixelIdx ) ); // E4
+				connections[shadowRayIdx + stride * 2 * 2] = make_float4( contribution, __int_as_float( 0 ) ); // E4
 			}
 		}
 	}
diff --git a/lib/RenderCore_OptixPrime_B/kernels/.cuda.cu b/lib/RenderCore_OptixPrime_B/kernels/.cuda.cu
index ce28fa683..67e111e05 100644
--- a/lib/RenderCore_OptixPrime_B/kernels/.cuda.cu
+++ b/lib/RenderCore_OptixPrime_B/kernels/.cuda.cu
@@ -33,6 +33,7 @@ __constant__ float4* skyPixels;
 __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -43,7 +44,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -73,6 +74,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
 {
@@ -150,6 +152,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_OptixPrime_B/kernels/pathtracer.h b/lib/RenderCore_OptixPrime_B/kernels/pathtracer.h
index adbb1f928..566cb7e8c 100644
--- a/lib/RenderCore_OptixPrime_B/kernels/pathtracer.h
+++ b/lib/RenderCore_OptixPrime_B/kernels/pathtracer.h
@@ -1,4 +1,4 @@
-/* pathtracer.cu - Copyright 2019/2020 Utrecht University
+/* pathtracer.h - Copyright 2019/2020 Utrecht University
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/lib/RenderCore_OptixPrime_BDPT/kernels/.cuda.cu b/lib/RenderCore_OptixPrime_BDPT/kernels/.cuda.cu
index 6c2b6914e..331c958ae 100644
--- a/lib/RenderCore_OptixPrime_BDPT/kernels/.cuda.cu
+++ b/lib/RenderCore_OptixPrime_BDPT/kernels/.cuda.cu
@@ -33,6 +33,7 @@ __constant__ float4* skyPixels;
 __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -50,6 +51,7 @@ __host__ void stageTriLights( CoreLightTri* p ) { stagedcpy( triLights, p ); }
 __host__ void stagePointLights( CorePointLight* p ) { stagedcpy( pointLights, p ); }
 __host__ void stageSpotLights( CoreSpotLight* p ) { stagedcpy( spotLights, p ); }
 __host__ void stageDirectionalLights( CoreDirectionalLight* p ) { stagedcpy( directionalLights, p ); }
+__host__ void stageLightTree( LightCluster* t ) { stagedcpy( lightTree, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_OptixPrime_PBRT/kernels/.cuda.cu b/lib/RenderCore_OptixPrime_PBRT/kernels/.cuda.cu
index 81d5aae28..ecddb1aea 100644
--- a/lib/RenderCore_OptixPrime_PBRT/kernels/.cuda.cu
+++ b/lib/RenderCore_OptixPrime_PBRT/kernels/.cuda.cu
@@ -36,6 +36,7 @@ __constant__ float4* skyPixels;
 __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -55,6 +56,7 @@ __host__ void stageTriLights( CoreLightTri* p ) { stagedcpy( triLights, p ); }
 __host__ void stagePointLights( CorePointLight* p ) { stagedcpy( pointLights, p ); }
 __host__ void stageSpotLights( CoreSpotLight* p ) { stagedcpy( spotLights, p ); }
 __host__ void stageDirectionalLights( CoreDirectionalLight* p ) { stagedcpy( directionalLights, p ); }
+__host__ void stageLightTree( LightCluster* t ) { stagedcpy( lightTree, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/RenderCore_PrimeAdaptive/kernels/.cuda.cu b/lib/RenderCore_PrimeAdaptive/kernels/.cuda.cu
index f80f5d583..68393b2aa 100644
--- a/lib/RenderCore_PrimeAdaptive/kernels/.cuda.cu
+++ b/lib/RenderCore_PrimeAdaptive/kernels/.cuda.cu
@@ -33,6 +33,7 @@ __constant__ float4* skyPixels;
 __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -43,7 +44,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -73,6 +74,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
 {
@@ -150,6 +152,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/rendercore_optix7/core_settings.h b/lib/rendercore_optix7/core_settings.h
index 5b995a162..253cc6b13 100644
--- a/lib/rendercore_optix7/core_settings.h
+++ b/lib/rendercore_optix7/core_settings.h
@@ -224,6 +224,7 @@ void stageDebugData( float4* p );
 void stageGeometryEpsilon( float e );
 void stageClampValue( float c );
 void stageMemcpy( void* d, void* s, int n );
+void stageLightTree( LightCluster* t );
 void pushStagedCopies();
 void SetCounters( Counters* p );
 
diff --git a/lib/rendercore_optix7/kernels/.cuda.cu b/lib/rendercore_optix7/kernels/.cuda.cu
index 2cdcb8dfe..eb855f3b1 100644
--- a/lib/rendercore_optix7/kernels/.cuda.cu
+++ b/lib/rendercore_optix7/kernels/.cuda.cu
@@ -34,6 +34,7 @@ __constant__ int skywidth;
 __constant__ int skyheight;
 __constant__ PathState* pathStates;
 __constant__ float4* debugData;
+__constant__ LightCluster* lightTree;
 
 __constant__ mat4 worldToSky;
 
@@ -44,7 +45,7 @@ __constant__ __device__ float clampValue;
 // staging: copies will be batched and carried out after rendering completes, 
 // to allow the CPU to update the scene concurrently with GPU rendering.
 
-enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV };
+enum { INSTS = 0, MATS, TLGHTS, PLGHTS, SLGHTS, DLGHTS, LCNTS, RGB32, RGBH, NRMLS, SKYPIX, SKYW, SKYH, SMAT, DBGDAT, GEPS, CLMPV, LTREE };
 
 // device pointers are not real pointers for nvcc, so we need a bit of a hack.
 
@@ -76,6 +77,7 @@ __host__ static void pushPtrCpy( int id, void* p )
 	if (id == NRMLS) cudaMemcpyToSymbol( nrm32, &p, sizeof( void* ) );
 	if (id == SKYPIX) cudaMemcpyToSymbol( skyPixels, &p, sizeof( void* ) );
 	if (id == DBGDAT) cudaMemcpyToSymbol( debugData, &p, sizeof( void* ) );
+	if (id == LTREE) cudaMemcpyToSymbol( lightTree, &p, sizeof( void* ) );
 }
 __host__ static void pushIntCpy( int id, const int v )
 {
@@ -166,6 +168,7 @@ __host__ void stageWorldToSky( const mat4& worldToLight ) { stageMatCpy( SMAT /*
 __host__ void stageDebugData( float4* p ) { stagePtrCpy( DBGDAT /* debugData */, p ); }
 __host__ void stageGeometryEpsilon( float e ) { stageF32Cpy( GEPS /* geometryEpsilon */, e ); }
 __host__ void stageClampValue( float c ) { stageF32Cpy( CLMPV /* clampValue */, c ); }
+__host__ void stageLightTree( LightCluster* t ) { stagePtrCpy( LTREE /* light tree */, t ); }
 __host__ void stageLightCounts( int tri, int point, int spot, int directional )
 {
 	const int4 counts = make_int4( tri, point, spot, directional );
diff --git a/lib/rendercore_optix7/kernels/pathtracer.h b/lib/rendercore_optix7/kernels/pathtracer.h
index e7ae40410..02a7abae7 100644
--- a/lib/rendercore_optix7/kernels/pathtracer.h
+++ b/lib/rendercore_optix7/kernels/pathtracer.h
@@ -142,6 +142,7 @@ void shadeKernel( float4* accumulator, const uint stride,
 				const CoreTri& tri = (const CoreTri&)instanceTriangles[PRIMIDX];
 				const float lightPdf = CalculateLightPDF( D, HIT_T, tri.area, N );
 				const float pickProb = LightPickProb( tri.ltriIdx, RAY_O, lastN, I /* the N at the previous vertex */ );
+				// const float pickProb = LightPickProbLTree( tri.ltriIdx, RAY_O, lastN, I /* the N at the previous vertex */, seed );
 				if ((bsdfPdf + lightPdf * pickProb) > 0) contribution = throughput * shadingData.color * (1.0f / (bsdfPdf + lightPdf * pickProb));
 			}
 			CLAMPINTENSITY;
@@ -183,6 +184,7 @@ void shadeKernel( float4* accumulator, const uint stride,
 	{
 		float pickProb, lightPdf = 0;
 		float3 lightColor, L = RandomPointOnLight( r4.x, r4.y, I, fN * faceDir, pickProb, lightPdf, lightColor ) - I;
+		// float3 lightColor, L = RandomPointOnLightLTree( r4.x, r4.y, seed, I, fN * faceDir, pickProb, lightPdf, lightColor, false ) - I;
 		const float dist = length( L );
 		L *= 1.0f / dist;
 		const float NdotL = dot( L, fN * faceDir );
diff --git a/lib/rendercore_optix7/rendercore.cpp b/lib/rendercore_optix7/rendercore.cpp
index cc9876a86..31c937f8b 100644
--- a/lib/rendercore_optix7/rendercore.cpp
+++ b/lib/rendercore_optix7/rendercore.cpp
@@ -565,6 +565,121 @@ void RenderCore::SetMaterials( CoreMaterial* mat, const int materialCount )
 	stageMaterialList( materialBuffer->DevPtr() );
 }
 
+//  +-----------------------------------------------------------------------------+
+//  |  RenderCore::UpdateLightTree                                                |
+//  |  Prepare the light BVH for stochastic lightcuts.                      LH2'20|
+//  +-----------------------------------------------------------------------------+
+void RenderCore::UpdateLightTreeNormals( const int node )
+{
+	LightCluster* treeData = lightTree->HostPtr();
+	if (treeData[node].left > -1)
+	{
+		UpdateLightTreeNormals( treeData[node].left );
+		UpdateLightTreeNormals( treeData[node].right );
+		// check normals of children
+		const float3 Nl = treeData[treeData[node].left].N;
+		const float3 Nr = treeData[treeData[node].right].N;
+		if (dot( Nl, Nr ) > 0.9f)
+		{
+			// left and right normals are similar enough to be useful
+			treeData[node].N = normalize( Nl + Nr );
+		}
+		else
+		{
+			// store an impossible normal
+			treeData[node].N = make_float3( 0 );
+		}
+		// store parent index in children
+		treeData[treeData[node].left].parent = treeData[treeData[node].right].parent = node;
+	}
+	else
+	{
+		// get normal from light source
+		const int lightIdx = treeData[node].light;
+		if (lightIdx & ((1 << 30) + (1 << 29)))
+		{
+			// point or spot light
+			treeData[node].N = make_float3( 0 );
+		}
+		else
+		{
+			// triangle light
+			CoreLightTri& light = triLightBuffer->HostPtr()[lightIdx];
+			treeData[node].N = light.N;
+		}
+	}
+}
+int RenderCore::FindBestMatch( int* todo, const int idx, const int N )
+{
+	float bestCost = 1e34f, bestIdx = 0;
+	LightCluster* treeData = lightTree->HostPtr();
+	for (int i = 0; i < N; i++) if (i != idx)
+	{
+		LightCluster tmp = treeData[todo[idx]];
+		tmp.bounds.Grow( treeData[todo[i]].bounds );
+		tmp.intensity += treeData[todo[i]].intensity;
+		float cost = tmp.Cost();
+		if (cost < bestCost) bestCost = cost, bestIdx = i;
+	}
+	return bestIdx;
+}
+void RenderCore::UpdateLightTree()
+{
+	// create an array of triLights
+	delete lightTree;
+	int N = triLightBuffer->GetSize(), remaining = N;
+	int M = pointLightBuffer->GetSize();
+	int O = spotLightBuffer->GetSize();
+	lightTree = new CoreBuffer<LightCluster>( (N + M + O) * 2, ON_HOST | ON_DEVICE | STAGED );
+	LightCluster* treeData = lightTree->HostPtr();
+	int* todo = new int[N + M + O];
+	for (int i = 0; i < N; i++)
+		treeData[i + 1] = LightCluster( triLightBuffer->HostPtr()[i], i ), // leaf for light i has index i + 1
+		todo[i] = i + 1;
+	for( int i = 0; i < M; i++ )
+		treeData[i + 1 + N] = LightCluster( pointLightBuffer->HostPtr()[i], i ),
+		todo[i + N] = i + 1 + N;
+	for( int i = 0; i < O; i++ )
+		treeData[i + 1 + N + M] = LightCluster( spotLightBuffer->HostPtr()[i], i ),
+		todo[i + N + M] = i + 1 + N + M;
+	remaining += M + O;
+	N += M + O;
+	// build the BVH, agglomerative
+	int A = 0;
+	int B = FindBestMatch( todo, A, remaining );
+	while (remaining > 1)
+	{
+		int C = FindBestMatch( todo, B, remaining );
+		if (A == C)
+		{
+			// create a new cluster
+			treeData[N + 1] = treeData[todo[A]];
+			treeData[N + 1].bounds.Grow( treeData[todo[B]].bounds );
+			treeData[N + 1].intensity += treeData[todo[B]].intensity;
+			treeData[N + 1].left = todo[A];
+			treeData[N + 1].right = todo[B];
+			// delete A and B cluster indices from 'todo'
+			for (int i = A; i < remaining - 1; i++) todo[i] = todo[i + 1]; // remove A
+			if (B > A) B--;
+			for (int i = B; i < remaining - 2; i++) todo[i] = todo[i + 1]; // remove B
+			remaining -= 2;
+			// add the new cluster index to 'todo'
+			todo[remaining] = ++N;
+			// prepare search for next couple
+			A = remaining++;
+			B = FindBestMatch( todo, A, remaining );
+		}
+		else A = B, B = C;
+	}
+	// finalize
+	treeData[0] = treeData[todo[0]]; // put root in convenient place
+	delete[] todo;
+	UpdateLightTreeNormals( 0 );
+	// copy to device
+	stageLightTree( lightTree->DevPtr() );
+	lightTree->StageCopyToDevice();
+}
+
 //  +-----------------------------------------------------------------------------+
 //  |  RenderCore::SetLights                                                      |
 //  |  Set the light data.                                                  LH2'20|
@@ -592,6 +707,7 @@ void RenderCore::SetLights( const CoreLightTri* triLights, const int triLightCou
 	stageDirectionalLights( StagedBufferResize<CoreDirectionalLight>( directionalLightBuffer, directionalLightCount, directionalLights ) );
 	stageLightCounts( triLightCount, pointLightCount, spotLightCount, directionalLightCount );
 	noDirectLightsInScene = (triLightCount + pointLightCount + spotLightCount + directionalLightCount) == 0;
+	UpdateLightTree();
 }
 
 //  +-----------------------------------------------------------------------------+
diff --git a/lib/rendercore_optix7/rendercore.h b/lib/rendercore_optix7/rendercore.h
index f5d45d9fb..aac37594e 100644
--- a/lib/rendercore_optix7/rendercore.h
+++ b/lib/rendercore_optix7/rendercore.h
@@ -70,6 +70,9 @@ class RenderCore : public CoreAPI_Base
 	void FinalizeRender();
 	template <class T> T* StagedBufferResize( CoreBuffer<T>*& lightBuffer, const int newCount, const T* sourceData );
 	void UpdateToplevel();
+	int FindBestMatch( int* todo, const int idx, const int N );
+	void UpdateLightTreeNormals( const int node );
+	void UpdateLightTree();
 	void SyncStorageType( const TexelStorage storage );
 	void CreateOptixContext( int cc );
 	// helpers
@@ -97,6 +100,7 @@ class RenderCore : public CoreAPI_Base
 	CoreBuffer<CoreLightTri>* triLightBuffer;		// tri lights
 	CoreBuffer<CorePointLight>* pointLightBuffer;	// point lights
 	CoreBuffer<CoreSpotLight>* spotLightBuffer;		// spot lights
+	CoreBuffer<LightCluster>* lightTree = 0;		// light tree for stochastic lightcuts
 	CoreBuffer<CoreDirectionalLight>* directionalLightBuffer;	// directional lights
 	CoreBuffer<float4>* texel128Buffer = 0;			// texel buffer 1: hdr ARGB128 texture data
 	CoreBuffer<uint>* normal32Buffer = 0;			// texel buffer 2: integer-encoded normals