diff --git a/QuickView/ImageLoader.cpp b/QuickView/ImageLoader.cpp
index 03bfe69..4906d35 100644
--- a/QuickView/ImageLoader.cpp
+++ b/QuickView/ImageLoader.cpp
@@ -9385,6 +9385,106 @@ HRESULT CImageLoader::ComputeHistogram(IWICBitmapSource* source, ImageMetadata*
     return E_FAIL;
 }
 
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace SIMD_ImageLoader {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+inline void ComputeHistRow(const uint8_t* row, int width, uint32_t* HistR, uint32_t* HistG, uint32_t* HistB, uint32_t* HistL, int& x_out) {
+    int x = 0;
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const hn::ScalableTag<uint32_t> d32;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (; x + step <= width; x += step) {
+        auto vB = hn::Zero(d8);
+        auto vG = hn::Zero(d8);
+        auto vR = hn::Zero(d8);
+        auto vA = hn::Zero(d8);
+        hn::LoadInterleaved4(d8, row + x * 4, vB, vG, vR, vA);
+
+        alignas(128) uint8_t b_arr[128];
+        alignas(128) uint8_t g_arr[128];
+        alignas(128) uint8_t r_arr[128];
+        hn::Store(vB, d8, b_arr);
+        hn::Store(vG, d8, g_arr);
+        hn::Store(vR, d8, r_arr);
+
+        auto r16_lo = hn::PromoteLowerTo(d16, vR);
+        auto r16_hi = hn::PromoteUpperTo(d16, vR);
+        auto g16_lo = hn::PromoteLowerTo(d16, vG);
+        auto g16_hi = hn::PromoteUpperTo(d16, vG);
+        auto b16_lo = hn::PromoteLowerTo(d16, vB);
+        auto b16_hi = hn::PromoteUpperTo(d16, vB);
+
+        auto r32_lo_lo = hn::PromoteLowerTo(d32, r16_lo);
+        auto r32_lo_hi = hn::PromoteUpperTo(d32, r16_lo);
+        auto r32_hi_lo = hn::PromoteLowerTo(d32, r16_hi);
+        auto r32_hi_hi = hn::PromoteUpperTo(d32, r16_hi);
+
+        auto g32_lo_lo = hn::PromoteLowerTo(d32, g16_lo);
+        auto g32_lo_hi = hn::PromoteUpperTo(d32, g16_lo);
+        auto g32_hi_lo = hn::PromoteLowerTo(d32, g16_hi);
+        auto g32_hi_hi = hn::PromoteUpperTo(d32, g16_hi);
+
+        auto b32_lo_lo = hn::PromoteLowerTo(d32, b16_lo);
+        auto b32_lo_hi = hn::PromoteUpperTo(d32, b16_lo);
+        auto b32_hi_lo = hn::PromoteLowerTo(d32, b16_hi);
+        auto b32_hi_hi = hn::PromoteUpperTo(d32, b16_hi);
+
+        auto c299 = hn::Set(d32, 299);
+        auto c587 = hn::Set(d32, 587);
+        auto c114 = hn::Set(d32, 114);
+        auto c500 = hn::Set(d32, 500);
+        auto c1000 = hn::Set(d32, 1000);
+
+        auto calc_l = [&](auto r, auto g, auto b) {
+            auto sum = hn::Add(hn::Add(hn::Add(hn::Mul(r, c299), hn::Mul(g, c587)), hn::Mul(b, c114)), c500);
+            return hn::Div(sum, c1000);
+        };
+
+        auto l_lo_lo = calc_l(r32_lo_lo, g32_lo_lo, b32_lo_lo);
+        auto l_lo_hi = calc_l(r32_lo_hi, g32_lo_hi, b32_lo_hi);
+        auto l_hi_lo = calc_l(r32_hi_lo, g32_hi_lo, b32_hi_lo);
+        auto l_hi_hi = calc_l(r32_hi_hi, g32_hi_hi, b32_hi_hi);
+
+        auto l16_lo = hn::OrderedDemote2To(d16, l_lo_lo, l_lo_hi);
+        auto l16_hi = hn::OrderedDemote2To(d16, l_hi_lo, l_hi_hi);
+        auto l8 = hn::OrderedDemote2To(d8, l16_lo, l16_hi);
+
+        alignas(128) uint8_t l_arr[128];
+        hn::Store(l8, d8, l_arr);
+
+        for (int j = 0; j < step; ++j) {
+            HistB[b_arr[j]]++;
+            HistG[g_arr[j]]++;
+            HistR[r_arr[j]]++;
+            HistL[l_arr[j]]++;
+        }
+    }
+    x_out = x;
+}
+} // HWY_NAMESPACE
+} // SIMD_ImageLoader
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace SIMD_ImageLoader {
+    HWY_EXPORT(ComputeHistRow);
+}
+#endif
+
 // [v5.2] Histogram from RawImageFrame (for HeavyLanePool pipeline)
 void CImageLoader::ComputeHistogramFromFrame(const QuickView::RawImageFrame& frame, ImageMetadata* pMetadata) {
     if (!frame.pixels || frame.width == 0 || frame.height == 0 || !pMetadata)
         return;
@@ -9411,93 +9511,16 @@ void CImageLoader::ComputeHistogramFromFrame(const QuickView::RawImageFrame& fra
     double lapSumSq = 0.0;
     uint64_t lapCount = 0;
 
-    // Assume BGRA8888 (standard for RawImageFrame)
     // Assume BGRA8888 (standard for RawImageFrame)
     for (UINT y = 0; y < frame.height; y += stepY) {
         const uint8_t* row = ptr + (UINT64)y * stride;
         UINT x = 0;
-        // Process 8 pixels per iteration.
-        const UINT width8 = frame.width & ~7u;
-
-#if QUICKVIEW_USE_STD_SIMD_HIST
-        using u32x8 = std::simd<uint32_t, 8>;
-        alignas(32) uint32_t bLane[8];
-        alignas(32) uint32_t gLane[8];
-        alignas(32) uint32_t rLane[8];
-        alignas(32) uint32_t lLane[8];
-
-        for (; x < width8; x += 8) {
-            const uint8_t* p = row + x * 4;
-            for (int i = 0; i < 8; ++i) {
-                const uint8_t* px = p + i * 4;
-                const uint32_t b = px[0];
-                const uint32_t g = px[1];
-                const uint32_t r = px[2];
-                bLane[i] = b;
-                gLane[i] = g;
-                rLane[i] = r;
-                pMetadata->HistB[b]++;
-                pMetadata->HistG[g]++;
-                pMetadata->HistR[r]++;
-            }
-
-            u32x8 vb;
-            u32x8 vg;
-            u32x8 vr;
-            vb.copy_from(bLane, std::element_aligned);
-            vg.copy_from(gLane, std::element_aligned);
-            vr.copy_from(rLane, std::element_aligned);
-
-            const u32x8 vl = (vr * 299u + vg * 587u + vb * 114u + 500u) / 1000u;
-            vl.copy_to(lLane, std::element_aligned);
-
-            for (int i = 0; i < 8; ++i) {
-                pMetadata->HistL[lLane[i]]++;
-            }
-        }
-#else
-        // [AVX2] SIMD Optimization for Luminance Calculation
-        const __m256i vCoeffs = _mm256_set1_epi64x(0x0000012B024B0072);
-        // Multiply by 8389 and shift by 23: 8389 / 2^23 = 8389 / 8388608 ≈ 0.000999999
-        // Max Luma = 255 * 1000 = 255000. Max mult = 255000 * 8389 = 2,139,195,000 < 2^31
-        const __m256i vMul = _mm256_set1_epi32(8389);
-
-        for (; x < width8; x += 8) {
-            __m256i vPixels = _mm256_loadu_si256((const __m256i*)(row + x * 4));
-            __m256i vPix03 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vPixels));
-            __m256i vPix47 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(vPixels, 1));
-            __m256i vSum03 = _mm256_madd_epi16(vPix03, vCoeffs);
-            __m256i vSum47 = _mm256_madd_epi16(vPix47, vCoeffs);
-            __m256i vLuma03 = _mm256_hadd_epi32(vSum03, vSum03);
-            __m256i vLuma47 = _mm256_hadd_epi32(vSum47, vSum47);
-            __m256i vDiv03 = _mm256_srli_epi32(_mm256_mullo_epi32(vLuma03, vMul), 23);
-            __m256i vDiv47 = _mm256_srli_epi32(_mm256_mullo_epi32(vLuma47, vMul), 23);
-
-            uint32_t l0 = _mm256_cvtsi256_si32(vDiv03);
-            uint32_t l1 = _mm256_extract_epi32(vDiv03, 1);
-            uint32_t l2 = _mm256_extract_epi32(vDiv03, 4);
-            uint32_t l3 = _mm256_extract_epi32(vDiv03, 5);
-            uint32_t l4 = _mm256_cvtsi256_si32(vDiv47);
-            uint32_t l5 = _mm256_extract_epi32(vDiv47, 1);
-            uint32_t l6 = _mm256_extract_epi32(vDiv47, 4);
-            uint32_t l7 = _mm256_extract_epi32(vDiv47, 5);
-
-            const uint8_t* p = row + x * 4;
-            pMetadata->HistB[p[0]]++;  pMetadata->HistG[p[1]]++;  pMetadata->HistR[p[2]]++;  pMetadata->HistL[l0]++;
-            pMetadata->HistB[p[4]]++;  pMetadata->HistG[p[5]]++;  pMetadata->HistR[p[6]]++;  pMetadata->HistL[l1]++;
-            pMetadata->HistB[p[8]]++;  pMetadata->HistG[p[9]]++;  pMetadata->HistR[p[10]]++; pMetadata->HistL[l2]++;
-            pMetadata->HistB[p[12]]++; pMetadata->HistG[p[13]]++; pMetadata->HistR[p[14]]++; pMetadata->HistL[l3]++;
-            pMetadata->HistB[p[16]]++; pMetadata->HistG[p[17]]++; pMetadata->HistR[p[18]]++; pMetadata->HistL[l4]++;
-            pMetadata->HistB[p[20]]++; pMetadata->HistG[p[21]]++; pMetadata->HistR[p[22]]++; pMetadata->HistL[l5]++;
-            pMetadata->HistB[p[24]]++; pMetadata->HistG[p[25]]++; pMetadata->HistR[p[26]]++; pMetadata->HistL[l6]++;
-            pMetadata->HistB[p[28]]++; pMetadata->HistG[p[29]]++; pMetadata->HistR[p[30]]++; pMetadata->HistL[l7]++;
-        }
-#endif
+        int x_out = 0;
+        HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(row, frame.width, pMetadata->HistR.data(), pMetadata->HistG.data(), pMetadata->HistB.data(), pMetadata->HistL.data(), x_out);
+        x = (UINT)x_out;
 
         for (; x < frame.width; x++) {
-            // Unrolling or SIMD could be added here, but scalar is fast enough with skip sampling.
-            // Layout: B, G, R, A
             uint8_t b = row[x * 4 + 0];
             uint8_t g = row[x * 4 + 1];
             uint8_t r = row[x * 4 + 2];
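Note on the block above: HWY_EXPORT and HWY_DYNAMIC_DISPATCH normally rely on the translation unit being recompiled once per target via foreach_target.h. With only hwy/highway.h included, the code should still build and run, but the dispatch table holds just the static baseline target (pinned to SSE4 above), so the runtime dispatch is effectively a no-op. A minimal sketch of the usual preamble, in case multi-target codegen is intended here (the file path is an illustrative assumption):

    // Sketch only: standard Highway multi-target setup for a .cpp that
    // uses HWY_DYNAMIC_DISPATCH. foreach_target.h re-includes this file
    // once per enabled target before the kernels are compiled.
    #undef HWY_TARGET_INCLUDE
    #define HWY_TARGET_INCLUDE "QuickView/ImageLoader.cpp"
    #include <hwy/foreach_target.h>  // must precede hwy/highway.h
    #include <hwy/highway.h>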
diff --git a/QuickView/QuickView.vcxproj b/QuickView/QuickView.vcxproj
index 615ee6c..9b6e288 100644
--- a/QuickView/QuickView.vcxproj
+++ b/QuickView/QuickView.vcxproj
@@ -96,7 +96,6 @@
       MaxSpeed
       AnySuitable
       false
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
      Fast
       stdcpplatest
       /Ob2 /sdl- /utf-8 /Gw /Zc:inline /GR- %(AdditionalOptions)
diff --git a/QuickView/SIMDUtils.h b/QuickView/SIMDUtils.h
index 1ce41e6..4cde1f4 100644
--- a/QuickView/SIMDUtils.h
+++ b/QuickView/SIMDUtils.h
@@ -2,9 +2,9 @@
 #include <cstdint>
 #include <cstring>
 #include <algorithm>
-#include <immintrin.h>
 #include <vector>
 #include <cmath>
+
 #if defined(__has_include)
 #if __has_include(<simd>)
 #include <simd>
@@ -21,447 +21,266 @@
 #else
 #define QVIEW_SIMDUTILS_USE_STD_SIMD_RESIZE 0
 #endif
+
 #include "SystemInfo.h"
 
-namespace SIMDUtils {
-
-    // One-time hardware detection (shared across all translation units)
-    inline bool HasAVX512F() {
-        return SystemInfo::Cached().hasAVX512F;
-    }
+// Highway dynamic dispatch configuration
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
 
-    // Fast Premultiply Alpha
-    inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
-        if (stride == 0) stride = width * 4;
-
-        const __m256i shuffleMask = _mm256_setr_epi8(
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15,
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-        );
-        const int blendMask = 0x88;
-
-        for (int y = 0; y < height; ++y) {
-            uint8_t* row = pData + (size_t)y * stride;
-            int x = 0;
-
-            // AVX-512 Loop
-            if (HasAVX512F()) {
-                const __m512i shuffleMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                    3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-                ));
-
-                for (; x <= width - 16; x += 16) {
-                    uint8_t* p = row + x * 4;
-                    __m512i src = _mm512_loadu_si512(p);
-                    __m512i alphas8 = _mm512_shuffle_epi8(src, shuffleMask512);
-
-                    __m512i pLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(src));
-                    __m512i pHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(src, 1));
-                    __m512i aLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(alphas8));
-                    __m512i aHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(alphas8, 1));
-
-                    __m512i mulLo = _mm512_srli_epi16(_mm512_mullo_epi16(pLo, aLo), 8);
-                    __m512i mulHi = _mm512_srli_epi16(_mm512_mullo_epi16(pHi, aHi), 8);
-
-                    mulLo = _mm512_mask_blend_epi16(0x88888888, mulLo, pLo);
-                    mulHi = _mm512_mask_blend_epi16(0x88888888, mulHi, pHi);
-
-                    __m512i packed = _mm512_packus_epi16(mulLo, mulHi);
-                    packed = _mm512_permutex_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                    _mm512_storeu_si512(p, packed);
-                }
-            }
+#include <hwy/highway.h>
 
-            // AVX-2 Loop
-            for (; x <= width - 8; x += 8) {
-                uint8_t* p = row + x * 4;
-                __m256i src = _mm256_loadu_si256((__m256i*)p);
-                __m256i alphas8 = _mm256_shuffle_epi8(src, shuffleMask);
-                __m256i pLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(src));
-                __m256i pHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(src, 1));
-                __m256i aLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(alphas8));
-                __m256i aHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(alphas8, 1));
-
-                __m256i mulLo = _mm256_srli_epi16(_mm256_mullo_epi16(pLo, aLo), 8);
-                __m256i mulHi = _mm256_srli_epi16(_mm256_mullo_epi16(pHi, aHi), 8);
-
-                mulLo = _mm256_blend_epi16(mulLo, pLo, blendMask);
-                mulHi = _mm256_blend_epi16(mulHi, pHi, blendMask);
-
-                __m256i packed = _mm256_packus_epi16(mulLo, mulHi);
-                packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                _mm256_storeu_si256((__m256i*)p, packed);
-            }
+HWY_BEFORE_NAMESPACE();
+namespace SIMDUtils {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Fast Premultiply Alpha
+inline void PremultiplyAlpha_BGRA_Impl(uint8_t* pData, int width, int height, int stride) {
+    if (stride == 0) stride = width * 4;
+
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (int y = 0; y < height; ++y) {
+        uint8_t* row = pData + (size_t)y * stride;
+        int x = 0;
+
+        for (; x + step <= width; x += step) {
+            uint8_t* p = row + x * 4;
+            auto vB = hn::Zero(d8);
+            auto vG = hn::Zero(d8);
+            auto vR = hn::Zero(d8);
+            auto vA = hn::Zero(d8);
 
-            // Scalar fallback
-            for (; x < width; ++x) {
-                uint8_t* px = row + x * 4;
-                uint8_t alpha = px[3];
-                if (alpha == 0) {
-                    px[0] = px[1] = px[2] = 0;
-                } else {
-                    px[0] = (uint8_t)((px[0] * alpha) >> 8);
-                    px[1] = (uint8_t)((px[1] * alpha) >> 8);
-                    px[2] = (uint8_t)((px[2] * alpha) >> 8);
-                }
-            }
+            hn::LoadInterleaved4(d8, p, vB, vG, vR, vA);
+
+            auto a16_lo = hn::PromoteLowerTo(d16, vA);
+            auto a16_hi = hn::PromoteUpperTo(d16, vA);
+
+            auto b16_lo = hn::PromoteLowerTo(d16, vB);
+            auto b16_hi = hn::PromoteUpperTo(d16, vB);
+            auto b16_lo_mul = hn::ShiftRight<8>(hn::Mul(b16_lo, a16_lo));
+            auto b16_hi_mul = hn::ShiftRight<8>(hn::Mul(b16_hi, a16_hi));
+            vB = hn::OrderedDemote2To(d8, b16_lo_mul, b16_hi_mul);
+
+            auto g16_lo = hn::PromoteLowerTo(d16, vG);
+            auto g16_hi = hn::PromoteUpperTo(d16, vG);
+            auto g16_lo_mul = hn::ShiftRight<8>(hn::Mul(g16_lo, a16_lo));
+            auto g16_hi_mul = hn::ShiftRight<8>(hn::Mul(g16_hi, a16_hi));
+            vG = hn::OrderedDemote2To(d8, g16_lo_mul, g16_hi_mul);
+
+            auto r16_lo = hn::PromoteLowerTo(d16, vR);
+            auto r16_hi = hn::PromoteUpperTo(d16, vR);
+            auto r16_lo_mul = hn::ShiftRight<8>(hn::Mul(r16_lo, a16_lo));
+            auto r16_hi_mul = hn::ShiftRight<8>(hn::Mul(r16_hi, a16_hi));
+            vR = hn::OrderedDemote2To(d8, r16_lo_mul, r16_hi_mul);
+
+            hn::StoreInterleaved4(vB, vG, vR, vA, d8, p);
         }
-    }
 
-    // SIMD Swizzle RGBA→BGRA with Alpha Optimization
-    inline void SwizzleRGBA_to_BGRA_Premul(uint8_t* pData, size_t pixelCount) {
-        uint8_t* p = pData;
-        size_t i = 0;
-
-        // AVX-512 Swizzle
-        if (HasAVX512F()) {
-            const __m512i swizzleMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
-            ));
-            const __m512i alphaMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-            ));
-
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m512i src = _mm512_loadu_si512(p + i * 4);
-                __m512i swizzled = _mm512_shuffle_epi8(src, swizzleMask512);
-                __m512i alphas8 = _mm512_shuffle_epi8(src, alphaMask512);
-
-                __m512i pLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(swizzled));
-                __m512i pHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(swizzled, 1));
-                __m512i aLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(alphas8));
-                __m512i aHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(alphas8, 1));
-
-                __m512i mulLo = _mm512_srli_epi16(_mm512_mullo_epi16(pLo, aLo), 8);
-                __m512i mulHi = _mm512_srli_epi16(_mm512_mullo_epi16(pHi, aHi), 8);
-
-                mulLo = _mm512_mask_blend_epi16(0x88888888, mulLo, pLo);
-                mulHi = _mm512_mask_blend_epi16(0x88888888, mulHi, pHi);
-
-                __m512i packed = _mm512_packus_epi16(mulLo, mulHi);
-                packed = _mm512_permutex_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                _mm512_storeu_si512(p + i * 4, packed);
-            }
-        }
+        for (; x < width; ++x) {
+            uint8_t* px = row + x * 4;
+            uint8_t alpha = px[3];
+            if (alpha == 0) {
+                px[0] = px[1] = px[2] = 0;
+            } else {
+                px[0] = (uint8_t)((px[0] * alpha) >> 8);
+                px[1] = (uint8_t)((px[1] * alpha) >> 8);
+                px[2] = (uint8_t)((px[2] * alpha) >> 8);
+            }
+        }
+    }
+}
+
+// SIMD Swizzle RGBA to BGRA
+inline void SwizzleRGBA_to_BGRA_Premul_Impl(uint8_t* pData, size_t pixelCount) {
+    uint8_t* p = pData;
+    size_t i = 0;
+
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (; i + step <= pixelCount; i += step) {
+        auto vR = hn::Zero(d8);
+        auto vG = hn::Zero(d8);
+        auto vB = hn::Zero(d8);
+        auto vA = hn::Zero(d8);
+
+        hn::LoadInterleaved4(d8, p + i * 4, vR, vG, vB, vA);
+
+        auto a16_lo = hn::PromoteLowerTo(d16, vA);
+        auto a16_hi = hn::PromoteUpperTo(d16, vA);
+
+        auto b16_lo = hn::PromoteLowerTo(d16, vB);
+        auto b16_hi = hn::PromoteUpperTo(d16, vB);
+        auto b16_lo_mul = hn::ShiftRight<8>(hn::Mul(b16_lo, a16_lo));
+        auto b16_hi_mul = hn::ShiftRight<8>(hn::Mul(b16_hi, a16_hi));
+        vB = hn::OrderedDemote2To(d8, b16_lo_mul, b16_hi_mul);
+
+        auto g16_lo = hn::PromoteLowerTo(d16, vG);
+        auto g16_hi = hn::PromoteUpperTo(d16, vG);
+        auto g16_lo_mul = hn::ShiftRight<8>(hn::Mul(g16_lo, a16_lo));
+        auto g16_hi_mul = hn::ShiftRight<8>(hn::Mul(g16_hi, a16_hi));
+        vG = hn::OrderedDemote2To(d8, g16_lo_mul, g16_hi_mul);
+
+        auto r16_lo = hn::PromoteLowerTo(d16, vR);
+        auto r16_hi = hn::PromoteUpperTo(d16, vR);
+        auto r16_lo_mul = hn::ShiftRight<8>(hn::Mul(r16_lo, a16_lo));
+        auto r16_hi_mul = hn::ShiftRight<8>(hn::Mul(r16_hi, a16_hi));
+        vR = hn::OrderedDemote2To(d8, r16_lo_mul, r16_hi_mul);
+
+        hn::StoreInterleaved4(vB, vG, vR, vA, d8, p + i * 4);
+    }
 
-        // AVX2 Swizzle
-        const __m256i swizzleMask = _mm256_setr_epi8(
-            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
-            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
-        );
-        const __m256i alphaBroadcastMask = _mm256_setr_epi8(
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15,
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-        );
-
-        for (; i + 8 <= pixelCount; i += 8) {
-            __m256i src = _mm256_loadu_si256((__m256i*)(p + i * 4));
-            __m256i swizzled = _mm256_shuffle_epi8(src, swizzleMask);
-            __m256i alphas8 = _mm256_shuffle_epi8(src, alphaBroadcastMask);
-
-            __m256i pLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(swizzled));
-            __m256i pHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(swizzled, 1));
-            __m256i aLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(alphas8));
-            __m256i aHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(alphas8, 1));
-
-            __m256i mulLo = _mm256_srli_epi16(_mm256_mullo_epi16(pLo, aLo), 8);
-            __m256i mulHi = _mm256_srli_epi16(_mm256_mullo_epi16(pHi, aHi), 8);
-
-            mulLo = _mm256_blend_epi16(mulLo, pLo, 0x88);
-            mulHi = _mm256_blend_epi16(mulHi, pHi, 0x88);
-
-            __m256i packed = _mm256_packus_epi16(mulLo, mulHi);
-            packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-            _mm256_storeu_si256((__m256i*)(p + i * 4), packed);
-        }
-
-        // Scalar Fallback
-        for (; i < pixelCount; ++i) {
-            uint8_t r = p[i*4+0];
-            uint8_t g = p[i*4+1];
-            uint8_t b = p[i*4+2];
-            uint8_t a = p[i*4+3];
-            if (a == 255) {
-                p[i*4+0] = b;
-                p[i*4+2] = r;
-            } else if (a == 0) {
-                p[i*4+0] = 0; p[i*4+1] = 0; p[i*4+2] = 0;
-            }
-            else {
-                p[i*4+0] = (uint8_t)((b * a + 127) / 255);
-                p[i*4+1] = (uint8_t)((g * a + 127) / 255);
-                p[i*4+2] = (uint8_t)((r * a + 127) / 255);
-            }
-        }
-    }
+    for (; i < pixelCount; ++i) {
+        uint8_t r = p[i*4+0];
+        uint8_t g = p[i*4+1];
+        uint8_t b = p[i*4+2];
+        uint8_t a = p[i*4+3];
+        if (a == 255) {
+            p[i*4+0] = b;
+            p[i*4+2] = r;
+        } else if (a == 0) {
+            p[i*4+0] = 0; p[i*4+1] = 0; p[i*4+2] = 0;
+        } else {
+            p[i*4+0] = (uint8_t)((b * a + 127) / 255);
+            p[i*4+1] = (uint8_t)((g * a + 127) / 255);
+            p[i*4+2] = (uint8_t)((r * a + 127) / 255);
+        }
+    }
+}
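One behavioral detail shared by the old and new code above: the vector paths premultiply with (x * a) >> 8 (truncating division by 256), while the swizzle scalar tail uses the exact (x * a + 127) / 255, so the last few pixels of a buffer can differ by one from their SIMD-processed neighbors. If exact agreement matters, the standard divide-by-255 identity vectorizes with the same 16-bit multiply/shift ops already used here; a sketch, not part of this patch:

    // Exact (x * a) / 255 with rounding to nearest, using only an add and
    // two shifts after the multiply, so it maps onto the Mul/ShiftRight
    // path above.
    static inline uint8_t MulDiv255(uint8_t x, uint8_t a) {
        const uint32_t t = (uint32_t)x * a + 128;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }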
-    // Bilinear Resize with precomputed coefficients + optional std::simd channel path
-    inline void ResizeBilinear(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride = 0) {
-        if (!src || !dst || w <= 0 || h <= 0 || newW <= 0 || newH <= 0) return;
-        if (srcStride == 0) srcStride = w * 4;
-        if (dstStride == 0) dstStride = newW * 4;
-
-        constexpr int kWeightBits = 11;
-        constexpr int kWeightScale = 1 << kWeightBits; // 2048
-        constexpr int kWeightShift = kWeightBits * 2;  // 22
-        constexpr int kWeightRound = 1 << (kWeightShift - 1);
-
-        struct AxisCoeff {
-            int idx0;
-            int idx1;
-            int w0;
-            int w1;
-        };
-
-        auto buildAxisCoeff = [kWeightScale](int srcSize, int dstSize, std::vector<AxisCoeff>& out) {
-            out.resize(dstSize);
-            if (srcSize <= 1 || dstSize <= 1) {
-                for (int i = 0; i < dstSize; ++i) {
-                    out[i].idx0 = 0;
-                    out[i].idx1 = 0;
-                    out[i].w0 = kWeightScale;
-                    out[i].w1 = 0;
-                }
-                return;
-            }
+// Find Peak
+inline float FindPeak_R32G32B32A32_FLOAT_Impl(const float* pData, size_t pixelCount) {
+    if (!pData || pixelCount == 0) return 1.0f;
 
-            const double scale = static_cast<double>(srcSize - 1) / static_cast<double>(dstSize - 1);
-            for (int i = 0; i < dstSize; ++i) {
-                const double srcPos = static_cast<double>(i) * scale;
-                int idx0 = static_cast<int>(srcPos);
-                idx0 = (std::clamp)(idx0, 0, srcSize - 1);
-                const int idx1 = (std::min)(idx0 + 1, srcSize - 1);
-
-                const double frac = srcPos - static_cast<double>(idx0);
-                int w1 = static_cast<int>(frac * static_cast<double>(kWeightScale) + 0.5);
-                w1 = (std::clamp)(w1, 0, kWeightScale);
-                const int w0 = kWeightScale - w1;
-
-                out[i].idx0 = idx0;
-                out[i].idx1 = idx1;
-                out[i].w0 = w0;
-                out[i].w1 = w1;
-            }
-        };
-
-        std::vector<AxisCoeff> xCoeff;
-        std::vector<AxisCoeff> yCoeff;
-        buildAxisCoeff(w, newW, xCoeff);
-        buildAxisCoeff(h, newH, yCoeff);
-
-        // Process rows
-        for (int y = 0; y < newH; ++y) {
-            const AxisCoeff yc = yCoeff[y];
-            const uint8_t* row0 = src + static_cast<size_t>(yc.idx0) * static_cast<size_t>(srcStride);
-            const uint8_t* row1 = src + static_cast<size_t>(yc.idx1) * static_cast<size_t>(srcStride);
-            uint8_t* pd = dst + static_cast<size_t>(y) * static_cast<size_t>(dstStride);
-
-            int x = 0;
-
-            // AVX2 unrolled 4-pixel loop using 32-bit math for absolute precision
-            for (; x + 3 < newW; x += 4) {
-                // Fetch indices
-                const int i00 = xCoeff[x+0].idx0 * 4; const int i01 = xCoeff[x+0].idx1 * 4;
-                const int i10 = xCoeff[x+1].idx0 * 4; const int i11 = xCoeff[x+1].idx1 * 4;
-                const int i20 = xCoeff[x+2].idx0 * 4; const int i21 = xCoeff[x+2].idx1 * 4;
-                const int i30 = xCoeff[x+3].idx0 * 4; const int i31 = xCoeff[x+3].idx1 * 4;
-
-                // Load 4 pixels per variable
-                __m128i v_s00 = _mm_set_epi32(*(const uint32_t*)(row0 + i30), *(const uint32_t*)(row0 + i20), *(const uint32_t*)(row0 + i10), *(const uint32_t*)(row0 + i00));
-                __m128i v_s01 = _mm_set_epi32(*(const uint32_t*)(row0 + i31), *(const uint32_t*)(row0 + i21), *(const uint32_t*)(row0 + i11), *(const uint32_t*)(row0 + i01));
-                __m128i v_s10 = _mm_set_epi32(*(const uint32_t*)(row1 + i30), *(const uint32_t*)(row1 + i20), *(const uint32_t*)(row1 + i10), *(const uint32_t*)(row1 + i00));
-                __m128i v_s11 = _mm_set_epi32(*(const uint32_t*)(row1 + i31), *(const uint32_t*)(row1 + i21), *(const uint32_t*)(row1 + i11), *(const uint32_t*)(row1 + i01));
-
-                // Unpack to 32-bit floats
-                __m256i p00_lo = _mm256_cvtepu8_epi32(v_s00);
-                __m256i p00_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s00, 8));
-                __m256i p01_lo = _mm256_cvtepu8_epi32(v_s01);
-                __m256i p01_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s01, 8));
-                __m256i p10_lo = _mm256_cvtepu8_epi32(v_s10);
-                __m256i p10_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s10, 8));
-                __m256i p11_lo = _mm256_cvtepu8_epi32(v_s11);
-                __m256i p11_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s11, 8));
-
-                // Weights calculation
-                int32_t w00_0 = xCoeff[x+0].w0 * yc.w0; int32_t w01_0 = xCoeff[x+0].w1 * yc.w0;
-                int32_t w10_0 = xCoeff[x+0].w0 * yc.w1; int32_t w11_0 = xCoeff[x+0].w1 * yc.w1;
-
-                int32_t w00_1 = xCoeff[x+1].w0 * yc.w0; int32_t w01_1 = xCoeff[x+1].w1 * yc.w0;
-                int32_t w10_1 = xCoeff[x+1].w0 * yc.w1; int32_t w11_1 = xCoeff[x+1].w1 * yc.w1;
-
-                int32_t w00_2 = xCoeff[x+2].w0 * yc.w0; int32_t w01_2 = xCoeff[x+2].w1 * yc.w0;
-                int32_t w10_2 = xCoeff[x+2].w0 * yc.w1; int32_t w11_2 = xCoeff[x+2].w1 * yc.w1;
-
-                int32_t w00_3 = xCoeff[x+3].w0 * yc.w0; int32_t w01_3 = xCoeff[x+3].w1 * yc.w0;
-                int32_t w10_3 = xCoeff[x+3].w0 * yc.w1; int32_t w11_3 = xCoeff[x+3].w1 * yc.w1;
-
-                // Broadcast weights across pixel channels
-                __m256i w00_lo_v = _mm256_set_epi32(w00_1, w00_1, w00_1, w00_1, w00_0, w00_0, w00_0, w00_0);
-                __m256i w01_lo_v = _mm256_set_epi32(w01_1, w01_1, w01_1, w01_1, w01_0, w01_0, w01_0, w01_0);
-                __m256i w10_lo_v = _mm256_set_epi32(w10_1, w10_1, w10_1, w10_1, w10_0, w10_0, w10_0, w10_0);
-                __m256i w11_lo_v = _mm256_set_epi32(w11_1, w11_1, w11_1, w11_1, w11_0, w11_0, w11_0, w11_0);
-
-                __m256i w00_hi_v = _mm256_set_epi32(w00_3, w00_3, w00_3, w00_3, w00_2, w00_2, w00_2, w00_2);
-                __m256i w01_hi_v = _mm256_set_epi32(w01_3, w01_3, w01_3, w01_3, w01_2, w01_2, w01_2, w01_2);
-                __m256i w10_hi_v = _mm256_set_epi32(w10_3, w10_3, w10_3, w10_3, w10_2, w10_2, w10_2, w10_2);
-                __m256i w11_hi_v = _mm256_set_epi32(w11_3, w11_3, w11_3, w11_3, w11_2, w11_2, w11_2, w11_2);
-
-                // Multiplication & Accumulation
-                __m256i sum_lo = _mm256_mullo_epi32(p00_lo, w00_lo_v);
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p01_lo, w01_lo_v));
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p10_lo, w10_lo_v));
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p11_lo, w11_lo_v));
-
-                __m256i sum_hi = _mm256_mullo_epi32(p00_hi, w00_hi_v);
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p01_hi, w01_hi_v));
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p10_hi, w10_hi_v));
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p11_hi, w11_hi_v));
-
-                // Add Rounding Factor & Right Shift
-                __m256i vRound = _mm256_set1_epi32(kWeightRound);
-                sum_lo = _mm256_srai_epi32(_mm256_add_epi32(sum_lo, vRound), kWeightShift);
-                sum_hi = _mm256_srai_epi32(_mm256_add_epi32(sum_hi, vRound), kWeightShift);
-
-                // Pack down to 16-bit
-                __m256i packed16 = _mm256_packs_epi32(sum_lo, sum_hi);
-
-                // Pack down to 8-bit
-                __m256i packed8 = _mm256_packus_epi16(packed16, packed16);
-
-                // Re-arrange into contiguous memory using lane extraction
-                __m128i p0_p2 = _mm256_castsi256_si128(packed8);
-                __m128i p1_p3 = _mm256_extracti128_si256(packed8, 1);
-
-                *(uint32_t*)(pd + x * 4 + 0) = _mm_cvtsi128_si32(p0_p2);
-                *(uint32_t*)(pd + x * 4 + 4) = _mm_cvtsi128_si32(p1_p3);
-                *(uint32_t*)(pd + x * 4 + 8) = _mm_extract_epi32(p0_p2, 1);
-                *(uint32_t*)(pd + x * 4 + 12) = _mm_extract_epi32(p1_p3, 1);
-            }
+    float peak = 1.0f;
+    size_t i = 0;
 
-            // Scalar fallback for remaining pixels
-            for (; x < newW; ++x) {
-                const AxisCoeff xc = xCoeff[x];
-                const uint8_t* s00 = row0 + static_cast<size_t>(xc.idx0) * 4;
-                const uint8_t* s01 = row0 + static_cast<size_t>(xc.idx1) * 4;
-                const uint8_t* s10 = row1 + static_cast<size_t>(xc.idx0) * 4;
-                const uint8_t* s11 = row1 + static_cast<size_t>(xc.idx1) * 4;
-
-                const int w00 = xc.w0 * yc.w0;
-                const int w01 = xc.w1 * yc.w0;
-                const int w10 = xc.w0 * yc.w1;
-                const int w11 = xc.w1 * yc.w1;
-
-                const size_t dstBase = static_cast<size_t>(x) * 4;
-                pd[dstBase + 0] = static_cast<uint8_t>((s00[0] * w00 + s01[0] * w01 + s10[0] * w10 + s11[0] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 1] = static_cast<uint8_t>((s00[1] * w00 + s01[1] * w01 + s10[1] * w10 + s11[1] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 2] = static_cast<uint8_t>((s00[2] * w00 + s01[2] * w01 + s10[2] * w10 + s11[2] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 3] = static_cast<uint8_t>((s00[3] * w00 + s01[3] * w01 + s10[3] * w10 + s11[3] * w11 + kWeightRound) >> kWeightShift);
-            }
-        }
-    }
-
-    // --- Peak Detection (HDR / Linear) ---
-
-    static inline float _mm256_reduce_max_ps(__m256 v) {
-        __m128 lo = _mm256_castps256_ps128(v);
-        __m128 hi = _mm256_extractf128_ps(v, 1);
-        __m128 m1 = _mm_max_ps(lo, hi);
-        __m128 m2 = _mm_max_ps(m1, _mm_movehdup_ps(m1));
-        m2 = _mm_max_ps(m2, _mm_movehl_ps(m2, m2));
-        return _mm_cvtss_f32(m2);
+    const hn::ScalableTag<float> df;
+    const size_t N = hn::Lanes(df);
+    const int step = (int)N;
+
+    auto vPeak = hn::Set(df, 1.0f);
+
+    for (; i + step <= pixelCount; i += step) {
+        auto vR = hn::Zero(df);
+        auto vG = hn::Zero(df);
+        auto vB = hn::Zero(df);
+        auto vA = hn::Zero(df);
+
+        hn::LoadInterleaved4(df, pData + i * 4, vR, vG, vB, vA);
+
+        auto m0 = hn::Max(vR, vG);
+        auto m1 = hn::Max(m0, vB);
+
+        vPeak = hn::Max(vPeak, m1);
     }
-#ifdef __AVX512F__
-    static inline float _mm512_reduce_max_ps(__m512 v) {
-        __m256 lo = _mm512_castps512_ps256(v);
-        __m256 hi = _mm512_extractf32x8_ps(v, 1);
-        return _mm256_reduce_max_ps(_mm256_max_ps(lo, hi));
+
+    auto vMaxAll = hn::MaxOfLanes(df, vPeak);
+    peak = hn::GetLane(vMaxAll);
+
+    for (; i < pixelCount; ++i) {
+        float r = pData[i * 4 + 0];
+        float g = pData[i * 4 + 1];
+        float b = pData[i * 4 + 2];
+        peak = (std::max)({peak, r, g, b});
     }
-#endif
 
-    ///
-    /// Fast Peak Detection for R32G32B32A32_FLOAT (Full Scan)
-    /// Scans the entire buffer for the maximum color component value.
-    /// Baseline is 1.0f (SDR white).
-    ///
-    inline float FindPeak_R32G32B32A32_FLOAT(const float* pData, size_t pixelCount) {
-        if (!pData || pixelCount == 0) return 1.0f;
-
-        float peak = 1.0f;
-        size_t i = 0;
-
-        // --- 1. Top of the line: AVX-512 Scan ---
-        if (HasAVX512F()) {
-#ifdef __AVX512F__
-            __m512 vPeak = _mm512_set1_ps(1.0f);
-            // Unroll 4x (16 pixels per loop) to saturate memory throughput
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m512 p0 = _mm512_loadu_ps(pData + (i + 0) * 4);
-                __m512 p1 = _mm512_loadu_ps(pData + (i + 4) * 4);
-                __m512 p2 = _mm512_loadu_ps(pData + (i + 8) * 4);
-                __m512 p3 = _mm512_loadu_ps(pData + (i + 12) * 4);
-                vPeak = _mm512_max_ps(vPeak, _mm512_max_ps(p0, p1));
-                vPeak = _mm512_max_ps(vPeak, _mm512_max_ps(p2, p3));
-            }
-            // Tail
-            for (; i + 4 <= pixelCount; i += 4) {
-                vPeak = _mm512_max_ps(vPeak, _mm512_loadu_ps(pData + i * 4));
-            }
-            peak = _mm512_reduce_max_ps(vPeak);
-#endif
-        }
-        // --- 2. High Performance: AVX2 Scan ---
-        else {
-            __m256 vPeak = _mm256_setzero_ps();
-            __m256 vMask = _mm256_setr_ps(1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f); // Mask out Alpha
-
-            // Unroll 8x (16 pixels per loop)
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m256 p0 = _mm256_loadu_ps(pData + (i + 0) * 4);
-                __m256 p1 = _mm256_loadu_ps(pData + (i + 2) * 4);
-                __m256 p2 = _mm256_loadu_ps(pData + (i + 4) * 4);
-                __m256 p3 = _mm256_loadu_ps(pData + (i + 6) * 4);
-                __m256 p4 = _mm256_loadu_ps(pData + (i + 8) * 4);
-                __m256 p5 = _mm256_loadu_ps(pData + (i + 10) * 4);
-                __m256 p6 = _mm256_loadu_ps(pData + (i + 12) * 4);
-                __m256 p7 = _mm256_loadu_ps(pData + (i + 14) * 4);
-
-                // Keep only R, G, B
-                p0 = _mm256_mul_ps(p0, vMask);
-                p1 = _mm256_mul_ps(p1, vMask);
-                p2 = _mm256_mul_ps(p2, vMask);
-                p3 = _mm256_mul_ps(p3, vMask);
-                p4 = _mm256_mul_ps(p4, vMask);
-                p5 = _mm256_mul_ps(p5, vMask);
-                p6 = _mm256_mul_ps(p6, vMask);
-                p7 = _mm256_mul_ps(p7, vMask);
-
-                __m256 m0 = _mm256_max_ps(p0, p1);
-                __m256 m1 = _mm256_max_ps(p2, p3);
-                __m256 m2 = _mm256_max_ps(p4, p5);
-                __m256 m3 = _mm256_max_ps(p6, p7);
-
-                vPeak = _mm256_max_ps(vPeak, _mm256_max_ps(m0, m1));
-                vPeak = _mm256_max_ps(vPeak, _mm256_max_ps(m2, m3));
-            }
-            // Tail
-            for (; i + 2 <= pixelCount; i += 2) {
-                __m256 p = _mm256_loadu_ps(pData + i * 4);
-                vPeak = _mm256_max_ps(vPeak, _mm256_mul_ps(p, vMask));
-            }
-            peak = _mm256_reduce_max_ps(vPeak);
-        }
+    return peak;
+}
+
+struct AxisCoeff { int idx0, idx1, w0, w1; };
+
+inline void ResizeBilinear_Impl(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride) {
+    if (!src || !dst || w <= 0 || h <= 0 || newW <= 0 || newH <= 0) return;
+    if (srcStride == 0) srcStride = w * 4;
+    if (dstStride == 0) dstStride = newW * 4;
+
+    constexpr int kWeightBits = 11;
+    constexpr int kWeightScale = 1 << kWeightBits;
+    constexpr int kWeightShift = kWeightBits * 2;
+    constexpr int kWeightRound = 1 << (kWeightShift - 1);
+
+    auto buildAxisCoeff = [kWeightScale](int srcSize, int dstSize, std::vector<AxisCoeff>& out) {
+        out.resize(dstSize);
+        if (srcSize <= 1 || dstSize <= 1) {
+            for (int i = 0; i < dstSize; ++i) { out[i].idx0 = 0; out[i].idx1 = 0; out[i].w0 = kWeightScale; out[i].w1 = 0; }
+            return;
+        }
+        const double scale = static_cast<double>(srcSize - 1) / static_cast<double>(dstSize - 1);
+        for (int i = 0; i < dstSize; ++i) {
+            double srcPos = static_cast<double>(i) * scale;
+            int idx0 = (std::min)((std::max)(static_cast<int>(srcPos), 0), srcSize - 1);
+            int idx1 = (std::min)(idx0 + 1, srcSize - 1);
+            double frac = srcPos - static_cast<double>(idx0);
+            int w1 = (std::min)((std::max)(static_cast<int>(frac * static_cast<double>(kWeightScale) + 0.5), 0), kWeightScale);
+            out[i].idx0 = idx0; out[i].idx1 = idx1; out[i].w0 = kWeightScale - w1; out[i].w1 = w1;
+        }
-
-        // --- 3. Robust Fix: Scalar Fallback for partial pixels ---
-        for (; i < pixelCount; ++i) {
-            float r = pData[i * 4 + 0];
-            float g = pData[i * 4 + 1];
-            float b = pData[i * 4 + 2];
-            peak = (std::max)({peak, r, g, b});
-        }
+    };
+
+    std::vector<AxisCoeff> xCoeff, yCoeff;
+    buildAxisCoeff(w, newW, xCoeff);
+    buildAxisCoeff(h, newH, yCoeff);
+
+    for (int y = 0; y < newH; ++y) {
+        const AxisCoeff yc = yCoeff[y];
+        const uint8_t* row0 = src + static_cast<size_t>(yc.idx0) * static_cast<size_t>(srcStride);
+        const uint8_t* row1 = src + static_cast<size_t>(yc.idx1) * static_cast<size_t>(srcStride);
+        uint8_t* pd = dst + static_cast<size_t>(y) * static_cast<size_t>(dstStride);
+
+        int x = 0;
+
+        for (; x < newW; ++x) {
+            const AxisCoeff xc = xCoeff[x];
+            const uint8_t* s00 = row0 + static_cast<size_t>(xc.idx0) * 4;
+            const uint8_t* s01 = row0 + static_cast<size_t>(xc.idx1) * 4;
+            const uint8_t* s10 = row1 + static_cast<size_t>(xc.idx0) * 4;
+            const uint8_t* s11 = row1 + static_cast<size_t>(xc.idx1) * 4;
+
+            const int w00 = xc.w0 * yc.w0;
+            const int w01 = xc.w1 * yc.w0;
+            const int w10 = xc.w0 * yc.w1;
+            const int w11 = xc.w1 * yc.w1;
+
+            const size_t dstBase = static_cast<size_t>(x) * 4;
+            pd[dstBase + 0] = static_cast<uint8_t>((s00[0] * w00 + s01[0] * w01 + s10[0] * w10 + s11[0] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 1] = static_cast<uint8_t>((s00[1] * w00 + s01[1] * w01 + s10[1] * w10 + s11[1] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 2] = static_cast<uint8_t>((s00[2] * w00 + s01[2] * w01 + s10[2] * w10 + s11[2] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 3] = static_cast<uint8_t>((s00[3] * w00 + s01[3] * w01 + s10[3] * w10 + s11[3] * w11 + kWeightRound) >> kWeightShift);
+        }
-
-        return peak;
     }
+}
+
+} // namespace HWY_NAMESPACE
 } // namespace SIMDUtils
+HWY_AFTER_NAMESPACE();
 
+#if HWY_ONCE
+namespace SIMDUtils {
+    inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
+        HWY_STATIC_DISPATCH(PremultiplyAlpha_BGRA_Impl)(pData, width, height, stride);
+    }
+    inline void SwizzleRGBA_to_BGRA_Premul(uint8_t* pData, size_t pixelCount) {
+        HWY_STATIC_DISPATCH(SwizzleRGBA_to_BGRA_Premul_Impl)(pData, pixelCount);
+    }
+    inline float FindPeak_R32G32B32A32_FLOAT(const float* pData, size_t pixelCount) {
+        return HWY_STATIC_DISPATCH(FindPeak_R32G32B32A32_FLOAT_Impl)(pData, pixelCount);
+    }
+    inline void ResizeBilinear(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride = 0) {
+        HWY_STATIC_DISPATCH(ResizeBilinear_Impl)(src, w, h, srcStride, dst, newW, newH, dstStride);
+    }
+}
+#endif
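Worth noting for the wrappers just above: HWY_STATIC_DISPATCH resolves at compile time to Highway's static target, which the macro block at the top of this header pins to SSE4 on x64, so these helpers never select a wider target at runtime the way the histogram kernel in ImageLoader.cpp does. If runtime selection is wanted here as well, the usual shape is to move the _Impl functions into a per-target-compiled .cpp and export them; a sketch under that assumption:

    // Sketch: runtime-dispatched wrapper mirroring the ImageLoader.cpp
    // pattern. Assumes the _Impl definitions live in a translation unit
    // re-included per target via foreach_target.h (see earlier note).
    #if HWY_ONCE
    namespace SIMDUtils {
        HWY_EXPORT(PremultiplyAlpha_BGRA_Impl);
        inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
            HWY_DYNAMIC_DISPATCH(PremultiplyAlpha_BGRA_Impl)(pData, width, height, stride);
        }
    }
    #endif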
diff --git a/QuickView/SettingsOverlay.cpp b/QuickView/SettingsOverlay.cpp
index 7e1e5eb..187ab8a 100644
--- a/QuickView/SettingsOverlay.cpp
+++ b/QuickView/SettingsOverlay.cpp
@@ -61,14 +61,6 @@ static std::wstring GetAppVersion() {
     return L"2.1.0"; // Fallback
 }
 
-static bool CheckAVX2() {
-    int cpuInfo[4];
-    __cpuid(cpuInfo, 0);
-    if (cpuInfo[0] < 7) return false;
-    __cpuidex(cpuInfo, 7, 0);
-    return (cpuInfo[1] & (1 << 5)) != 0;
-}
-
 // Helper to get Real Windows Version via RtlGetVersion
 typedef LONG (WINAPI *RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
 
@@ -123,8 +115,7 @@ std::wstring GetSystemInfo() {
     if (si.wProcessorArchitecture == PROCESSOR_ARCHITECTURE_ARM64) arch = L"ARM64";
 
     // 3. SIMD
-    std::wstring simd = L"SIMD: AVX2 [Active]"; // Default checked
-    if (!CheckAVX2()) simd = L"SIMD: SSE2";
+    std::wstring simd = L"SIMD: Highway [Active]"; // Default checked
 
     return osVer + L" | " + arch + L" | " + simd;
 }
@@ -1981,8 +1972,8 @@ void SettingsOverlay::Render(ID2D1DeviceContext* pRT, float winW, float winH) {
 
     D2D1_RECT_F textRect = D2D1::RectF(contentX, sysY, contentX + contentW, sysY + 20);
 
-    // Highlight "AVX2 [Active]" in Green
-    size_t pos = item.label.find(L"SIMD: AVX2 [Active]");
+    // Highlight "Highway [Active]" in Green
+    size_t pos = item.label.find(L"SIMD: Highway [Active]");
     if (pos != std::wstring::npos) {
         // Draw first part Gray
         std::wstring part1 = item.label.substr(0, pos);
@@ -1990,7 +1981,7 @@ void SettingsOverlay::Render(ID2D1DeviceContext* pRT, float winW, float winH) {
 
         // Draw active part Green (Approx offset)
         D2D1_RECT_F avxRect = D2D1::RectF(contentX + 225.0f * s, sysY, contentX + contentW, sysY + 20.0f * s);
-        pRT->DrawText(L"SIMD: AVX2 [Active]", 19, m_textFormatItem.Get(), avxRect, m_brushSuccess.Get());
+        pRT->DrawText(L"SIMD: Highway [Active]", 22, m_textFormatItem.Get(), avxRect, m_brushSuccess.Get());
     } else {
         pRT->DrawText(item.label.c_str(), (UINT32)item.label.length(), m_textFormatItem.Get(), textRect, m_brushTextDim.Get());
     }
diff --git a/QuickView/main.cpp b/QuickView/main.cpp
index d78dec2..35dd4da 100644
--- a/QuickView/main.cpp
+++ b/QuickView/main.cpp
@@ -149,27 +149,6 @@ static std::string GetAppVersionUTF8() {
     return "2.1.0";
 }
 
-static bool SupportsAvx2ByCpuid() {
-#if defined(_M_X64) || defined(_M_IX86)
-    int cpuInfo[4] = {};
-    __cpuid(cpuInfo, 0);
-    if (cpuInfo[0] < 7) return false;
-
-    __cpuid(cpuInfo, 1);
-    const bool hasOsxsave = (cpuInfo[2] & (1 << 27)) != 0;
-    const bool hasAvx = (cpuInfo[2] & (1 << 28)) != 0;
-    if (!hasOsxsave || !hasAvx) return false;
-
-    const unsigned long long xcr0 = _xgetbv(0);
-    if ((xcr0 & 0x6) != 0x6) return false;
-
-    __cpuidex(cpuInfo, 7, 0);
-    return (cpuInfo[1] & (1 << 5)) != 0;
-#else
-    return false;
-#endif
-}
-
 // Function Prototypes
 static void SyncDCompState(HWND hwnd, float winW, float winH, bool animate);
 
@@ -5630,20 +5609,6 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR lpCmdLine, int nCmdSh
         }
         g_isMasterProcess = (routeResult == QuickView::ProcessRouter::RouteResult::BecameMaster);
     }
-
-    // [v3.2.3] AVX2 Check - Critical: App compiled with /arch:AVX2, will crash without it
-    const bool hasAvx2 = IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE) || SupportsAvx2ByCpuid();
-    if (!hasAvx2) {
-        MessageBoxW(nullptr,
-            L"QuickView requires a CPU with AVX2 support.\n\n"
-            L"Minimum Requirements:\n"
-            L"Intel: Core 4th Gen (Haswell, 2013) or later\n"
-            L"AMD: Ryzen (Zen, 2017) or later\n\n"
-            L"Your CPU does not support AVX2. The application cannot run.",
-            L"QuickView - Hardware Not Supported",
-            MB_OK | MB_ICONERROR);
-        return 1;
-    }
 
     AppStrings::Init();
diff --git a/fix.sh b/fix.sh
new file mode 100644
index 0000000..b31b53f
--- /dev/null
+++ b/fix.sh
@@ -0,0 +1 @@
+git add QuickView/ImageLoader.cpp QuickView/SIMDUtils.h QuickView/SettingsOverlay.cpp QuickView/main.cpp QuickView/QuickView.vcxproj
diff --git a/test_main b/test_main
new file mode 100755
index 0000000..6d6f3b2
Binary files /dev/null and b/test_main differ
diff --git a/test_main.cpp b/test_main.cpp
new file mode 100644
index 0000000..c25e100
--- /dev/null
+++ b/test_main.cpp
@@ -0,0 +1,30 @@
+#include "QuickView/SystemInfo.h"
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace SIMD_ImageLoader {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+inline void ComputeHistRow(const uint8_t* row, int width, uint32_t* HistR, uint32_t* HistG, uint32_t* HistB, uint32_t* HistL, int& x_out) {
+}
+}
+}
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace SIMD_ImageLoader {
+    HWY_EXPORT(ComputeHistRow);
+}
+#endif
+
+int main() {
+    int x_out = 0;
+    HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(nullptr, 0, nullptr, nullptr, nullptr, nullptr, x_out);
+    return 0;
+}
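The test_main.cpp harness above only proves that the export/dispatch machinery links and tolerates an empty call against a stub kernel. A natural next step, sketched here against the real ComputeHistRow (buffer sizes and pixel values are arbitrary illustrations), is to feed one solid-color BGRA row and check that each histogram bucket receives one count per processed pixel:

    // Hypothetical extension of test_main.cpp's main(): one 64-pixel row,
    // B=10 G=20 R=30 A=255. Expected luma bin per the kernel's formula:
    // (30*299 + 20*587 + 10*114 + 500) / 1000 = 22.
    #include <cstdint>
    #include <cstdio>
    #include <vector>
    int main() {
        std::vector<uint8_t> row(64 * 4);
        for (size_t i = 0; i < row.size(); i += 4) {
            row[i + 0] = 10; row[i + 1] = 20; row[i + 2] = 30; row[i + 3] = 255;
        }
        std::vector<uint32_t> r(256), g(256), b(256), l(256);
        int x_out = 0;
        HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(
            row.data(), 64, r.data(), g.data(), b.data(), l.data(), x_out);
        // Every processed pixel should land in exactly one bucket per channel.
        std::printf("processed=%d B[10]=%u G[20]=%u R[30]=%u L[22]=%u\n",
                    x_out, b[10], g[20], r[30], l[22]);
        return 0;
    }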