diff --git a/QuickView/ImageLoader.cpp b/QuickView/ImageLoader.cpp
index 03bfe69..4906d35 100644
--- a/QuickView/ImageLoader.cpp
+++ b/QuickView/ImageLoader.cpp
@@ -9385,6 +9385,106 @@ HRESULT CImageLoader::ComputeHistogram(IWICBitmapSource* source, ImageMetadata*
     return E_FAIL;
 }
 
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace SIMD_ImageLoader {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+inline void ComputeHistRow(const uint8_t* row, int width, uint32_t* HistR, uint32_t* HistG, uint32_t* HistB, uint32_t* HistL, int& x_out) {
+    int x = 0;
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const hn::ScalableTag<uint32_t> d32;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (; x + step <= width; x += step) {
+        auto vB = hn::Zero(d8);
+        auto vG = hn::Zero(d8);
+        auto vR = hn::Zero(d8);
+        auto vA = hn::Zero(d8);
+        hn::LoadInterleaved4(d8, row + x * 4, vB, vG, vR, vA);
+
+        alignas(128) uint8_t b_arr[128];
+        alignas(128) uint8_t g_arr[128];
+        alignas(128) uint8_t r_arr[128];
+        hn::Store(vB, d8, b_arr);
+        hn::Store(vG, d8, g_arr);
+        hn::Store(vR, d8, r_arr);
+
+        auto r16_lo = hn::PromoteLowerTo(d16, vR);
+        auto r16_hi = hn::PromoteUpperTo(d16, vR);
+        auto g16_lo = hn::PromoteLowerTo(d16, vG);
+        auto g16_hi = hn::PromoteUpperTo(d16, vG);
+        auto b16_lo = hn::PromoteLowerTo(d16, vB);
+        auto b16_hi = hn::PromoteUpperTo(d16, vB);
+
+        auto r32_lo_lo = hn::PromoteLowerTo(d32, r16_lo);
+        auto r32_lo_hi = hn::PromoteUpperTo(d32, r16_lo);
+        auto r32_hi_lo = hn::PromoteLowerTo(d32, r16_hi);
+        auto r32_hi_hi = hn::PromoteUpperTo(d32, r16_hi);
+
+        auto g32_lo_lo = hn::PromoteLowerTo(d32, g16_lo);
+        auto g32_lo_hi = hn::PromoteUpperTo(d32, g16_lo);
+        auto g32_hi_lo = hn::PromoteLowerTo(d32, g16_hi);
+        auto g32_hi_hi = hn::PromoteUpperTo(d32, g16_hi);
+
+        auto b32_lo_lo = hn::PromoteLowerTo(d32, b16_lo);
+        auto b32_lo_hi = hn::PromoteUpperTo(d32, b16_lo);
+        auto b32_hi_lo = hn::PromoteLowerTo(d32, b16_hi);
+        auto b32_hi_hi = hn::PromoteUpperTo(d32, b16_hi);
+
+        auto c299 = hn::Set(d32, 299);
+        auto c587 = hn::Set(d32, 587);
+        auto c114 = hn::Set(d32, 114);
+        auto c500 = hn::Set(d32, 500);
+        auto c1000 = hn::Set(d32, 1000);
+
+        auto calc_l = [&](auto r, auto g, auto b) {
+            auto sum = hn::Add(hn::Add(hn::Add(hn::Mul(r, c299), hn::Mul(g, c587)), hn::Mul(b, c114)), c500);
+            return hn::Div(sum, c1000);
+        };
+
+        auto l_lo_lo = calc_l(r32_lo_lo, g32_lo_lo, b32_lo_lo);
+        auto l_lo_hi = calc_l(r32_lo_hi, g32_lo_hi, b32_lo_hi);
+        auto l_hi_lo = calc_l(r32_hi_lo, g32_hi_lo, b32_hi_lo);
+        auto l_hi_hi = calc_l(r32_hi_hi, g32_hi_hi, b32_hi_hi);
+
+        auto l16_lo = hn::OrderedDemote2To(d16, l_lo_lo, l_lo_hi);
+        auto l16_hi = hn::OrderedDemote2To(d16, l_hi_lo, l_hi_hi);
+        auto l8 = hn::OrderedDemote2To(d8, l16_lo, l16_hi);
+
+        alignas(128) uint8_t l_arr[128];
+        hn::Store(l8, d8, l_arr);
+
+        for (int j = 0; j < step; ++j) {
+            HistB[b_arr[j]]++;
+            HistG[g_arr[j]]++;
+            HistR[r_arr[j]]++;
+            HistL[l_arr[j]]++;
+        }
+    }
+    x_out = x;
+}
+} // HWY_NAMESPACE
+} // SIMD_ImageLoader
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace SIMD_ImageLoader {
+    HWY_EXPORT(ComputeHistRow);
+}
+#endif
+
 // [v5.2] Histogram from RawImageFrame (for HeavyLanePool pipeline)
 void CImageLoader::ComputeHistogramFromFrame(const QuickView::RawImageFrame& frame, ImageMetadata* pMetadata) {
     if (!frame.pixels || frame.width == 0 || frame.height == 0 || !pMetadata)
         return;
@@ -9411,93 +9511,16 @@ void CImageLoader::ComputeHistogramFromFrame(const QuickView::RawImageFrame& fra
     double lapSumSq = 0.0;
     uint64_t lapCount = 0;
 
-    // Assume BGRA8888 (standard for RawImageFrame)
     // Assume BGRA8888 (standard for RawImageFrame)
     for (UINT y = 0; y < frame.height; y += stepY) {
         const uint8_t* row = ptr + (UINT64)y * stride;
         UINT x = 0;
-        // Process 8 pixels per iteration.
-        const UINT width8 = frame.width & ~7u;
-
-#if QUICKVIEW_USE_STD_SIMD_HIST
-        using u32x8 = std::simd<uint32_t, 8>;
-        alignas(32) uint32_t bLane[8];
-        alignas(32) uint32_t gLane[8];
-        alignas(32) uint32_t rLane[8];
-        alignas(32) uint32_t lLane[8];
-
-        for (; x < width8; x += 8) {
-            const uint8_t* p = row + x * 4;
-            for (int i = 0; i < 8; ++i) {
-                const uint8_t* px = p + i * 4;
-                const uint32_t b = px[0];
-                const uint32_t g = px[1];
-                const uint32_t r = px[2];
-                bLane[i] = b;
-                gLane[i] = g;
-                rLane[i] = r;
-                pMetadata->HistB[b]++;
-                pMetadata->HistG[g]++;
-                pMetadata->HistR[r]++;
-            }
-
-            u32x8 vb;
-            u32x8 vg;
-            u32x8 vr;
-            vb.copy_from(bLane, std::element_aligned);
-            vg.copy_from(gLane, std::element_aligned);
-            vr.copy_from(rLane, std::element_aligned);
-
-            const u32x8 vl = (vr * 299u + vg * 587u + vb * 114u + 500u) / 1000u;
-            vl.copy_to(lLane, std::element_aligned);
-
-            for (int i = 0; i < 8; ++i) {
-                pMetadata->HistL[lLane[i]]++;
-            }
-        }
-#else
-        // [AVX2] SIMD Optimization for Luminance Calculation
-        const __m256i vCoeffs = _mm256_set1_epi64x(0x0000012B024B0072);
-        // Multiply by 8389 and shift by 23: 8389 / 2^23 = 8389 / 8388608 ≈ 0.000999999
-        // Max Luma = 255 * 1000 = 255000. Max mult = 255000 * 8389 = 2,139,195,000 < 2^31
-        const __m256i vMul = _mm256_set1_epi32(8389);
-
-        for (; x < width8; x += 8) {
-            __m256i vPixels = _mm256_loadu_si256((const __m256i*)(row + x * 4));
-            __m256i vPix03 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(vPixels));
-            __m256i vPix47 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(vPixels, 1));
-            __m256i vSum03 = _mm256_madd_epi16(vPix03, vCoeffs);
-            __m256i vSum47 = _mm256_madd_epi16(vPix47, vCoeffs);
-            __m256i vLuma03 = _mm256_hadd_epi32(vSum03, vSum03);
-            __m256i vLuma47 = _mm256_hadd_epi32(vSum47, vSum47);
-            __m256i vDiv03 = _mm256_srli_epi32(_mm256_mullo_epi32(vLuma03, vMul), 23);
-            __m256i vDiv47 = _mm256_srli_epi32(_mm256_mullo_epi32(vLuma47, vMul), 23);
-
-            uint32_t l0 = _mm256_cvtsi256_si32(vDiv03);
-            uint32_t l1 = _mm256_extract_epi32(vDiv03, 1);
-            uint32_t l2 = _mm256_extract_epi32(vDiv03, 4);
-            uint32_t l3 = _mm256_extract_epi32(vDiv03, 5);
-            uint32_t l4 = _mm256_cvtsi256_si32(vDiv47);
-            uint32_t l5 = _mm256_extract_epi32(vDiv47, 1);
-            uint32_t l6 = _mm256_extract_epi32(vDiv47, 4);
-            uint32_t l7 = _mm256_extract_epi32(vDiv47, 5);
-
-            const uint8_t* p = row + x * 4;
-            pMetadata->HistB[p[0]]++;  pMetadata->HistG[p[1]]++;  pMetadata->HistR[p[2]]++;  pMetadata->HistL[l0]++;
-            pMetadata->HistB[p[4]]++;  pMetadata->HistG[p[5]]++;  pMetadata->HistR[p[6]]++;  pMetadata->HistL[l1]++;
-            pMetadata->HistB[p[8]]++;  pMetadata->HistG[p[9]]++;  pMetadata->HistR[p[10]]++; pMetadata->HistL[l2]++;
-            pMetadata->HistB[p[12]]++; pMetadata->HistG[p[13]]++; pMetadata->HistR[p[14]]++; pMetadata->HistL[l3]++;
-            pMetadata->HistB[p[16]]++; pMetadata->HistG[p[17]]++; pMetadata->HistR[p[18]]++; pMetadata->HistL[l4]++;
-            pMetadata->HistB[p[20]]++; pMetadata->HistG[p[21]]++; pMetadata->HistR[p[22]]++; pMetadata->HistL[l5]++;
-            pMetadata->HistB[p[24]]++; pMetadata->HistG[p[25]]++; pMetadata->HistR[p[26]]++; pMetadata->HistL[l6]++;
-            pMetadata->HistB[p[28]]++; pMetadata->HistG[p[29]]++; pMetadata->HistR[p[30]]++; pMetadata->HistL[l7]++;
-        }
-#endif
+        int x_out = 0;
+        HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(row, frame.width, pMetadata->HistR.data(), pMetadata->HistG.data(), pMetadata->HistB.data(), pMetadata->HistL.data(), x_out);
+        x = (UINT)x_out;
 
         for (; x < frame.width; x++) {
-            // Unrolling or SIMD could be added here, but scalar is fast enough with skip sampling.
-            // Layout: B, G, R, A
             uint8_t b = row[x * 4 + 0];
             uint8_t g = row[x * 4 + 1];
             uint8_t r = row[x * 4 + 2];
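Note on the block above: HWY_EXPORT and HWY_DYNAMIC_DISPATCH normally rely on the translation unit being recompiled once per target via foreach_target.h. With only hwy/highway.h included, the code should still build and run, but the dispatch table holds just the static baseline target (pinned to SSE4 above), so the runtime dispatch is effectively a no-op. A minimal sketch of the usual preamble, in case multi-target codegen is intended here (the file path is an illustrative assumption):

    // Sketch only: standard Highway multi-target setup for a .cpp that
    // uses HWY_DYNAMIC_DISPATCH. foreach_target.h re-includes this file
    // once per enabled target before the kernels are compiled.
    #undef HWY_TARGET_INCLUDE
    #define HWY_TARGET_INCLUDE "QuickView/ImageLoader.cpp"
    #include <hwy/foreach_target.h>  // must precede hwy/highway.h
    #include <hwy/highway.h>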
diff --git a/QuickView/QuickView.vcxproj b/QuickView/QuickView.vcxproj
index 615ee6c..9b6e288 100644
--- a/QuickView/QuickView.vcxproj
+++ b/QuickView/QuickView.vcxproj
@@ -96,7 +96,6 @@
       MaxSpeed
       AnySuitable
       false
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
      Fast
       stdcpplatest
       /Ob2 /sdl- /utf-8 /Gw /Zc:inline /GR- %(AdditionalOptions)
diff --git a/QuickView/SIMDUtils.h b/QuickView/SIMDUtils.h
index 1ce41e6..4cde1f4 100644
--- a/QuickView/SIMDUtils.h
+++ b/QuickView/SIMDUtils.h
@@ -2,9 +2,9 @@
 #include <cstdint>
 #include <cstring>
 #include <algorithm>
-#include <immintrin.h>
 #include <vector>
 #include <cmath>
+
 #if defined(__has_include)
 #if __has_include(<simd>)
 #include <simd>
@@ -21,447 +21,266 @@
 #else
 #define QVIEW_SIMDUTILS_USE_STD_SIMD_RESIZE 0
 #endif
+
 #include "SystemInfo.h"
 
-namespace SIMDUtils {
-
-    // One-time hardware detection (shared across all translation units)
-    inline bool HasAVX512F() {
-        return SystemInfo::Cached().hasAVX512F;
-    }
+// Highway dynamic dispatch configuration
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
 
-    // Fast Premultiply Alpha
-    inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
-        if (stride == 0) stride = width * 4;
-
-        const __m256i shuffleMask = _mm256_setr_epi8(
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15,
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-        );
-        const int blendMask = 0x88;
-
-        for (int y = 0; y < height; ++y) {
-            uint8_t* row = pData + (size_t)y * stride;
-            int x = 0;
-
-            // AVX-512 Loop
-            if (HasAVX512F()) {
-                const __m512i shuffleMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                    3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-                ));
-
-                for (; x <= width - 16; x += 16) {
-                    uint8_t* p = row + x * 4;
-                    __m512i src = _mm512_loadu_si512(p);
-                    __m512i alphas8 = _mm512_shuffle_epi8(src, shuffleMask512);
-
-                    __m512i pLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(src));
-                    __m512i pHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(src, 1));
-                    __m512i aLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(alphas8));
-                    __m512i aHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(alphas8, 1));
-
-                    __m512i mulLo = _mm512_srli_epi16(_mm512_mullo_epi16(pLo, aLo), 8);
-                    __m512i mulHi = _mm512_srli_epi16(_mm512_mullo_epi16(pHi, aHi), 8);
-
-                    mulLo = _mm512_mask_blend_epi16(0x88888888, mulLo, pLo);
-                    mulHi = _mm512_mask_blend_epi16(0x88888888, mulHi, pHi);
-
-                    __m512i packed = _mm512_packus_epi16(mulLo, mulHi);
-                    packed = _mm512_permutex_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                    _mm512_storeu_si512(p, packed);
-                }
-            }
+#include <hwy/highway.h>
 
-            // AVX-2 Loop
-            for (; x <= width - 8; x += 8) {
-                uint8_t* p = row + x * 4;
-                __m256i src = _mm256_loadu_si256((__m256i*)p);
-                __m256i alphas8 = _mm256_shuffle_epi8(src, shuffleMask);
-                __m256i pLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(src));
-                __m256i pHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(src, 1));
-                __m256i aLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(alphas8));
-                __m256i aHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(alphas8, 1));
-
-                __m256i mulLo = _mm256_srli_epi16(_mm256_mullo_epi16(pLo, aLo), 8);
-                __m256i mulHi = _mm256_srli_epi16(_mm256_mullo_epi16(pHi, aHi), 8);
-
-                mulLo = _mm256_blend_epi16(mulLo, pLo, blendMask);
-                mulHi = _mm256_blend_epi16(mulHi, pHi, blendMask);
-
-                __m256i packed = _mm256_packus_epi16(mulLo, mulHi);
-                packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                _mm256_storeu_si256((__m256i*)p, packed);
-            }
+HWY_BEFORE_NAMESPACE();
+namespace SIMDUtils {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Fast Premultiply Alpha
+inline void PremultiplyAlpha_BGRA_Impl(uint8_t* pData, int width, int height, int stride) {
+    if (stride == 0) stride = width * 4;
+
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (int y = 0; y < height; ++y) {
+        uint8_t* row = pData + (size_t)y * stride;
+        int x = 0;
+
+        for (; x + step <= width; x += step) {
+            uint8_t* p = row + x * 4;
+            auto vB = hn::Zero(d8);
+            auto vG = hn::Zero(d8);
+            auto vR = hn::Zero(d8);
+            auto vA = hn::Zero(d8);
 
-            // Scalar fallback
-            for (; x < width; ++x) {
-                uint8_t* px = row + x * 4;
-                uint8_t alpha = px[3];
-                if (alpha == 0) {
-                    px[0] = px[1] = px[2] = 0;
-                } else {
-                    px[0] = (uint8_t)((px[0] * alpha) >> 8);
-                    px[1] = (uint8_t)((px[1] * alpha) >> 8);
-                    px[2] = (uint8_t)((px[2] * alpha) >> 8);
-                }
-            }
+            hn::LoadInterleaved4(d8, p, vB, vG, vR, vA);
+
+            auto a16_lo = hn::PromoteLowerTo(d16, vA);
+            auto a16_hi = hn::PromoteUpperTo(d16, vA);
+
+            auto b16_lo = hn::PromoteLowerTo(d16, vB);
+            auto b16_hi = hn::PromoteUpperTo(d16, vB);
+            auto b16_lo_mul = hn::ShiftRight<8>(hn::Mul(b16_lo, a16_lo));
+            auto b16_hi_mul = hn::ShiftRight<8>(hn::Mul(b16_hi, a16_hi));
+            vB = hn::OrderedDemote2To(d8, b16_lo_mul, b16_hi_mul);
+
+            auto g16_lo = hn::PromoteLowerTo(d16, vG);
+            auto g16_hi = hn::PromoteUpperTo(d16, vG);
+            auto g16_lo_mul = hn::ShiftRight<8>(hn::Mul(g16_lo, a16_lo));
+            auto g16_hi_mul = hn::ShiftRight<8>(hn::Mul(g16_hi, a16_hi));
+            vG = hn::OrderedDemote2To(d8, g16_lo_mul, g16_hi_mul);
+
+            auto r16_lo = hn::PromoteLowerTo(d16, vR);
+            auto r16_hi = hn::PromoteUpperTo(d16, vR);
+            auto r16_lo_mul = hn::ShiftRight<8>(hn::Mul(r16_lo, a16_lo));
+            auto r16_hi_mul = hn::ShiftRight<8>(hn::Mul(r16_hi, a16_hi));
+            vR = hn::OrderedDemote2To(d8, r16_lo_mul, r16_hi_mul);
+
+            hn::StoreInterleaved4(vB, vG, vR, vA, d8, p);
         }
-    }
 
-    // SIMD Swizzle RGBA→BGRA with Alpha Optimization
-    inline void SwizzleRGBA_to_BGRA_Premul(uint8_t* pData, size_t pixelCount) {
-        uint8_t* p = pData;
-        size_t i = 0;
-
-        // AVX-512 Swizzle
-        if (HasAVX512F()) {
-            const __m512i swizzleMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
-            ));
-            const __m512i alphaMask512 = _mm512_broadcast_i32x4(_mm_setr_epi8(
-                3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-            ));
-
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m512i src = _mm512_loadu_si512(p + i * 4);
-                __m512i swizzled = _mm512_shuffle_epi8(src, swizzleMask512);
-                __m512i alphas8 = _mm512_shuffle_epi8(src, alphaMask512);
-
-                __m512i pLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(swizzled));
-                __m512i pHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(swizzled, 1));
-                __m512i aLo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(alphas8));
-                __m512i aHi = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(alphas8, 1));
-
-                __m512i mulLo = _mm512_srli_epi16(_mm512_mullo_epi16(pLo, aLo), 8);
-                __m512i mulHi = _mm512_srli_epi16(_mm512_mullo_epi16(pHi, aHi), 8);
-
-                mulLo = _mm512_mask_blend_epi16(0x88888888, mulLo, pLo);
-                mulHi = _mm512_mask_blend_epi16(0x88888888, mulHi, pHi);
-
-                __m512i packed = _mm512_packus_epi16(mulLo, mulHi);
-                packed = _mm512_permutex_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-                _mm512_storeu_si512(p + i * 4, packed);
-            }
-        }
+        for (; x < width; ++x) {
+            uint8_t* px = row + x * 4;
+            uint8_t alpha = px[3];
+            if (alpha == 0) {
+                px[0] = px[1] = px[2] = 0;
+            } else {
+                px[0] = (uint8_t)((px[0] * alpha) >> 8);
+                px[1] = (uint8_t)((px[1] * alpha) >> 8);
+                px[2] = (uint8_t)((px[2] * alpha) >> 8);
+            }
+        }
+    }
+}
+
+// SIMD Swizzle RGBA to BGRA
+inline void SwizzleRGBA_to_BGRA_Premul_Impl(uint8_t* pData, size_t pixelCount) {
+    uint8_t* p = pData;
+    size_t i = 0;
+
+    const hn::ScalableTag<uint8_t> d8;
+    const hn::ScalableTag<uint16_t> d16;
+    const size_t N8 = hn::Lanes(d8);
+    const int step = (int)N8;
+
+    for (; i + step <= pixelCount; i += step) {
+        auto vR = hn::Zero(d8);
+        auto vG = hn::Zero(d8);
+        auto vB = hn::Zero(d8);
+        auto vA = hn::Zero(d8);
+
+        hn::LoadInterleaved4(d8, p + i * 4, vR, vG, vB, vA);
+
+        auto a16_lo = hn::PromoteLowerTo(d16, vA);
+        auto a16_hi = hn::PromoteUpperTo(d16, vA);
+
+        auto b16_lo = hn::PromoteLowerTo(d16, vB);
+        auto b16_hi = hn::PromoteUpperTo(d16, vB);
+        auto b16_lo_mul = hn::ShiftRight<8>(hn::Mul(b16_lo, a16_lo));
+        auto b16_hi_mul = hn::ShiftRight<8>(hn::Mul(b16_hi, a16_hi));
+        vB = hn::OrderedDemote2To(d8, b16_lo_mul, b16_hi_mul);
+
+        auto g16_lo = hn::PromoteLowerTo(d16, vG);
+        auto g16_hi = hn::PromoteUpperTo(d16, vG);
+        auto g16_lo_mul = hn::ShiftRight<8>(hn::Mul(g16_lo, a16_lo));
+        auto g16_hi_mul = hn::ShiftRight<8>(hn::Mul(g16_hi, a16_hi));
+        vG = hn::OrderedDemote2To(d8, g16_lo_mul, g16_hi_mul);
+
+        auto r16_lo = hn::PromoteLowerTo(d16, vR);
+        auto r16_hi = hn::PromoteUpperTo(d16, vR);
+        auto r16_lo_mul = hn::ShiftRight<8>(hn::Mul(r16_lo, a16_lo));
+        auto r16_hi_mul = hn::ShiftRight<8>(hn::Mul(r16_hi, a16_hi));
+        vR = hn::OrderedDemote2To(d8, r16_lo_mul, r16_hi_mul);
+
+        hn::StoreInterleaved4(vB, vG, vR, vA, d8, p + i * 4);
+    }
 
-        // AVX2 Swizzle
-        const __m256i swizzleMask = _mm256_setr_epi8(
-            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
-            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
-        );
-        const __m256i alphaBroadcastMask = _mm256_setr_epi8(
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15,
-            3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15
-        );
-
-        for (; i + 8 <= pixelCount; i += 8) {
-            __m256i src = _mm256_loadu_si256((__m256i*)(p + i * 4));
-            __m256i swizzled = _mm256_shuffle_epi8(src, swizzleMask);
-            __m256i alphas8 = _mm256_shuffle_epi8(src, alphaBroadcastMask);
-
-            __m256i pLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(swizzled));
-            __m256i pHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(swizzled, 1));
-            __m256i aLo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(alphas8));
-            __m256i aHi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(alphas8, 1));
-
-            __m256i mulLo = _mm256_srli_epi16(_mm256_mullo_epi16(pLo, aLo), 8);
-            __m256i mulHi = _mm256_srli_epi16(_mm256_mullo_epi16(pHi, aHi), 8);
-
-            mulLo = _mm256_blend_epi16(mulLo, pLo, 0x88);
-            mulHi = _mm256_blend_epi16(mulHi, pHi, 0x88);
-
-            __m256i packed = _mm256_packus_epi16(mulLo, mulHi);
-            packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
-            _mm256_storeu_si256((__m256i*)(p + i * 4), packed);
-        }
-
-        // Scalar Fallback
-        for (; i < pixelCount; ++i) {
-            uint8_t r = p[i*4+0];
-            uint8_t g = p[i*4+1];
-            uint8_t b = p[i*4+2];
-            uint8_t a = p[i*4+3];
-            if (a == 255) {
-                p[i*4+0] = b;
-                p[i*4+2] = r;
-            } else if (a == 0) {
-                p[i*4+0] = 0; p[i*4+1] = 0; p[i*4+2] = 0;
-            }
-            else {
-                p[i*4+0] = (uint8_t)((b * a + 127) / 255);
-                p[i*4+1] = (uint8_t)((g * a + 127) / 255);
-                p[i*4+2] = (uint8_t)((r * a + 127) / 255);
-            }
-        }
-    }
+    for (; i < pixelCount; ++i) {
+        uint8_t r = p[i*4+0];
+        uint8_t g = p[i*4+1];
+        uint8_t b = p[i*4+2];
+        uint8_t a = p[i*4+3];
+        if (a == 255) {
+            p[i*4+0] = b;
+            p[i*4+2] = r;
+        } else if (a == 0) {
+            p[i*4+0] = 0; p[i*4+1] = 0; p[i*4+2] = 0;
+        } else {
+            p[i*4+0] = (uint8_t)((b * a + 127) / 255);
+            p[i*4+1] = (uint8_t)((g * a + 127) / 255);
+            p[i*4+2] = (uint8_t)((r * a + 127) / 255);
+        }
+    }
+}
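One behavioral detail shared by the old and new code above: the vector paths premultiply with (x * a) >> 8 (truncating division by 256), while the swizzle scalar tail uses the exact (x * a + 127) / 255, so the last few pixels of a buffer can differ by one from their SIMD-processed neighbors. If exact agreement matters, the standard divide-by-255 identity vectorizes with the same 16-bit multiply/shift ops already used here; a sketch, not part of this patch:

    // Exact (x * a) / 255 with rounding to nearest, using only an add and
    // two shifts after the multiply, so it maps onto the Mul/ShiftRight
    // path above.
    static inline uint8_t MulDiv255(uint8_t x, uint8_t a) {
        const uint32_t t = (uint32_t)x * a + 128;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }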
-    // Bilinear Resize with precomputed coefficients + optional std::simd channel path
-    inline void ResizeBilinear(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride = 0) {
-        if (!src || !dst || w <= 0 || h <= 0 || newW <= 0 || newH <= 0) return;
-        if (srcStride == 0) srcStride = w * 4;
-        if (dstStride == 0) dstStride = newW * 4;
-
-        constexpr int kWeightBits = 11;
-        constexpr int kWeightScale = 1 << kWeightBits; // 2048
-        constexpr int kWeightShift = kWeightBits * 2;  // 22
-        constexpr int kWeightRound = 1 << (kWeightShift - 1);
-
-        struct AxisCoeff {
-            int idx0;
-            int idx1;
-            int w0;
-            int w1;
-        };
-
-        auto buildAxisCoeff = [kWeightScale](int srcSize, int dstSize, std::vector<AxisCoeff>& out) {
-            out.resize(dstSize);
-            if (srcSize <= 1 || dstSize <= 1) {
-                for (int i = 0; i < dstSize; ++i) {
-                    out[i].idx0 = 0;
-                    out[i].idx1 = 0;
-                    out[i].w0 = kWeightScale;
-                    out[i].w1 = 0;
-                }
-                return;
-            }
+// Find Peak
+inline float FindPeak_R32G32B32A32_FLOAT_Impl(const float* pData, size_t pixelCount) {
+    if (!pData || pixelCount == 0) return 1.0f;
 
-            const double scale = static_cast<double>(srcSize - 1) / static_cast<double>(dstSize - 1);
-            for (int i = 0; i < dstSize; ++i) {
-                const double srcPos = static_cast<double>(i) * scale;
-                int idx0 = static_cast<int>(srcPos);
-                idx0 = (std::clamp)(idx0, 0, srcSize - 1);
-                const int idx1 = (std::min)(idx0 + 1, srcSize - 1);
-
-                const double frac = srcPos - static_cast<double>(idx0);
-                int w1 = static_cast<int>(frac * static_cast<double>(kWeightScale) + 0.5);
-                w1 = (std::clamp)(w1, 0, kWeightScale);
-                const int w0 = kWeightScale - w1;
-
-                out[i].idx0 = idx0;
-                out[i].idx1 = idx1;
-                out[i].w0 = w0;
-                out[i].w1 = w1;
-            }
-        };
-
-        std::vector<AxisCoeff> xCoeff;
-        std::vector<AxisCoeff> yCoeff;
-        buildAxisCoeff(w, newW, xCoeff);
-        buildAxisCoeff(h, newH, yCoeff);
-
-        // Process rows
-        for (int y = 0; y < newH; ++y) {
-            const AxisCoeff yc = yCoeff[y];
-            const uint8_t* row0 = src + static_cast<size_t>(yc.idx0) * static_cast<size_t>(srcStride);
-            const uint8_t* row1 = src + static_cast<size_t>(yc.idx1) * static_cast<size_t>(srcStride);
-            uint8_t* pd = dst + static_cast<size_t>(y) * static_cast<size_t>(dstStride);
-
-            int x = 0;
-
-            // AVX2 unrolled 4-pixel loop using 32-bit math for absolute precision
-            for (; x + 3 < newW; x += 4) {
-                // Fetch indices
-                const int i00 = xCoeff[x+0].idx0 * 4; const int i01 = xCoeff[x+0].idx1 * 4;
-                const int i10 = xCoeff[x+1].idx0 * 4; const int i11 = xCoeff[x+1].idx1 * 4;
-                const int i20 = xCoeff[x+2].idx0 * 4; const int i21 = xCoeff[x+2].idx1 * 4;
-                const int i30 = xCoeff[x+3].idx0 * 4; const int i31 = xCoeff[x+3].idx1 * 4;
-
-                // Load 4 pixels per variable
-                __m128i v_s00 = _mm_set_epi32(*(const uint32_t*)(row0 + i30), *(const uint32_t*)(row0 + i20), *(const uint32_t*)(row0 + i10), *(const uint32_t*)(row0 + i00));
-                __m128i v_s01 = _mm_set_epi32(*(const uint32_t*)(row0 + i31), *(const uint32_t*)(row0 + i21), *(const uint32_t*)(row0 + i11), *(const uint32_t*)(row0 + i01));
-                __m128i v_s10 = _mm_set_epi32(*(const uint32_t*)(row1 + i30), *(const uint32_t*)(row1 + i20), *(const uint32_t*)(row1 + i10), *(const uint32_t*)(row1 + i00));
-                __m128i v_s11 = _mm_set_epi32(*(const uint32_t*)(row1 + i31), *(const uint32_t*)(row1 + i21), *(const uint32_t*)(row1 + i11), *(const uint32_t*)(row1 + i01));
-
-                // Unpack to 32-bit floats
-                __m256i p00_lo = _mm256_cvtepu8_epi32(v_s00);
-                __m256i p00_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s00, 8));
-                __m256i p01_lo = _mm256_cvtepu8_epi32(v_s01);
-                __m256i p01_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s01, 8));
-                __m256i p10_lo = _mm256_cvtepu8_epi32(v_s10);
-                __m256i p10_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s10, 8));
-                __m256i p11_lo = _mm256_cvtepu8_epi32(v_s11);
-                __m256i p11_hi = _mm256_cvtepu8_epi32(_mm_srli_si128(v_s11, 8));
-
-                // Weights calculation
-                int32_t w00_0 = xCoeff[x+0].w0 * yc.w0; int32_t w01_0 = xCoeff[x+0].w1 * yc.w0;
-                int32_t w10_0 = xCoeff[x+0].w0 * yc.w1; int32_t w11_0 = xCoeff[x+0].w1 * yc.w1;
-
-                int32_t w00_1 = xCoeff[x+1].w0 * yc.w0; int32_t w01_1 = xCoeff[x+1].w1 * yc.w0;
-                int32_t w10_1 = xCoeff[x+1].w0 * yc.w1; int32_t w11_1 = xCoeff[x+1].w1 * yc.w1;
-
-                int32_t w00_2 = xCoeff[x+2].w0 * yc.w0; int32_t w01_2 = xCoeff[x+2].w1 * yc.w0;
-                int32_t w10_2 = xCoeff[x+2].w0 * yc.w1; int32_t w11_2 = xCoeff[x+2].w1 * yc.w1;
-
-                int32_t w00_3 = xCoeff[x+3].w0 * yc.w0; int32_t w01_3 = xCoeff[x+3].w1 * yc.w0;
-                int32_t w10_3 = xCoeff[x+3].w0 * yc.w1; int32_t w11_3 = xCoeff[x+3].w1 * yc.w1;
-
-                // Broadcast weights across pixel channels
-                __m256i w00_lo_v = _mm256_set_epi32(w00_1, w00_1, w00_1, w00_1, w00_0, w00_0, w00_0, w00_0);
-                __m256i w01_lo_v = _mm256_set_epi32(w01_1, w01_1, w01_1, w01_1, w01_0, w01_0, w01_0, w01_0);
-                __m256i w10_lo_v = _mm256_set_epi32(w10_1, w10_1, w10_1, w10_1, w10_0, w10_0, w10_0, w10_0);
-                __m256i w11_lo_v = _mm256_set_epi32(w11_1, w11_1, w11_1, w11_1, w11_0, w11_0, w11_0, w11_0);
-
-                __m256i w00_hi_v = _mm256_set_epi32(w00_3, w00_3, w00_3, w00_3, w00_2, w00_2, w00_2, w00_2);
-                __m256i w01_hi_v = _mm256_set_epi32(w01_3, w01_3, w01_3, w01_3, w01_2, w01_2, w01_2, w01_2);
-                __m256i w10_hi_v = _mm256_set_epi32(w10_3, w10_3, w10_3, w10_3, w10_2, w10_2, w10_2, w10_2);
-                __m256i w11_hi_v = _mm256_set_epi32(w11_3, w11_3, w11_3, w11_3, w11_2, w11_2, w11_2, w11_2);
-
-                // Multiplication & Accumulation
-                __m256i sum_lo = _mm256_mullo_epi32(p00_lo, w00_lo_v);
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p01_lo, w01_lo_v));
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p10_lo, w10_lo_v));
-                sum_lo = _mm256_add_epi32(sum_lo, _mm256_mullo_epi32(p11_lo, w11_lo_v));
-
-                __m256i sum_hi = _mm256_mullo_epi32(p00_hi, w00_hi_v);
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p01_hi, w01_hi_v));
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p10_hi, w10_hi_v));
-                sum_hi = _mm256_add_epi32(sum_hi, _mm256_mullo_epi32(p11_hi, w11_hi_v));
-
-                // Add Rounding Factor & Right Shift
-                __m256i vRound = _mm256_set1_epi32(kWeightRound);
-                sum_lo = _mm256_srai_epi32(_mm256_add_epi32(sum_lo, vRound), kWeightShift);
-                sum_hi = _mm256_srai_epi32(_mm256_add_epi32(sum_hi, vRound), kWeightShift);
-
-                // Pack down to 16-bit
-                __m256i packed16 = _mm256_packs_epi32(sum_lo, sum_hi);
-
-                // Pack down to 8-bit
-                __m256i packed8 = _mm256_packus_epi16(packed16, packed16);
-
-                // Re-arrange into contiguous memory using lane extraction
-                __m128i p0_p2 = _mm256_castsi256_si128(packed8);
-                __m128i p1_p3 = _mm256_extracti128_si256(packed8, 1);
-
-                *(uint32_t*)(pd + x * 4 + 0) = _mm_cvtsi128_si32(p0_p2);
-                *(uint32_t*)(pd + x * 4 + 4) = _mm_cvtsi128_si32(p1_p3);
-                *(uint32_t*)(pd + x * 4 + 8) = _mm_extract_epi32(p0_p2, 1);
-                *(uint32_t*)(pd + x * 4 + 12) = _mm_extract_epi32(p1_p3, 1);
-            }
+    float peak = 1.0f;
+    size_t i = 0;
 
-            // Scalar fallback for remaining pixels
-            for (; x < newW; ++x) {
-                const AxisCoeff xc = xCoeff[x];
-                const uint8_t* s00 = row0 + static_cast<size_t>(xc.idx0) * 4;
-                const uint8_t* s01 = row0 + static_cast<size_t>(xc.idx1) * 4;
-                const uint8_t* s10 = row1 + static_cast<size_t>(xc.idx0) * 4;
-                const uint8_t* s11 = row1 + static_cast<size_t>(xc.idx1) * 4;
-
-                const int w00 = xc.w0 * yc.w0;
-                const int w01 = xc.w1 * yc.w0;
-                const int w10 = xc.w0 * yc.w1;
-                const int w11 = xc.w1 * yc.w1;
-
-                const size_t dstBase = static_cast<size_t>(x) * 4;
-                pd[dstBase + 0] = static_cast<uint8_t>((s00[0] * w00 + s01[0] * w01 + s10[0] * w10 + s11[0] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 1] = static_cast<uint8_t>((s00[1] * w00 + s01[1] * w01 + s10[1] * w10 + s11[1] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 2] = static_cast<uint8_t>((s00[2] * w00 + s01[2] * w01 + s10[2] * w10 + s11[2] * w11 + kWeightRound) >> kWeightShift);
-                pd[dstBase + 3] = static_cast<uint8_t>((s00[3] * w00 + s01[3] * w01 + s10[3] * w10 + s11[3] * w11 + kWeightRound) >> kWeightShift);
-            }
-        }
-    }
-
-    // --- Peak Detection (HDR / Linear) ---
-
-    static inline float _mm256_reduce_max_ps(__m256 v) {
-        __m128 lo = _mm256_castps256_ps128(v);
-        __m128 hi = _mm256_extractf128_ps(v, 1);
-        __m128 m1 = _mm_max_ps(lo, hi);
-        __m128 m2 = _mm_max_ps(m1, _mm_movehdup_ps(m1));
-        m2 = _mm_max_ps(m2, _mm_movehl_ps(m2, m2));
-        return _mm_cvtss_f32(m2);
+    const hn::ScalableTag<float> df;
+    const size_t N = hn::Lanes(df);
+    const int step = (int)N;
+
+    auto vPeak = hn::Set(df, 1.0f);
+
+    for (; i + step <= pixelCount; i += step) {
+        auto vR = hn::Zero(df);
+        auto vG = hn::Zero(df);
+        auto vB = hn::Zero(df);
+        auto vA = hn::Zero(df);
+
+        hn::LoadInterleaved4(df, pData + i * 4, vR, vG, vB, vA);
+
+        auto m0 = hn::Max(vR, vG);
+        auto m1 = hn::Max(m0, vB);
+
+        vPeak = hn::Max(vPeak, m1);
     }
-#ifdef __AVX512F__
-    static inline float _mm512_reduce_max_ps(__m512 v) {
-        __m256 lo = _mm512_castps512_ps256(v);
-        __m256 hi = _mm512_extractf32x8_ps(v, 1);
-        return _mm256_reduce_max_ps(_mm256_max_ps(lo, hi));
+
+    auto vMaxAll = hn::MaxOfLanes(df, vPeak);
+    peak = hn::GetLane(vMaxAll);
+
+    for (; i < pixelCount; ++i) {
+        float r = pData[i * 4 + 0];
+        float g = pData[i * 4 + 1];
+        float b = pData[i * 4 + 2];
+        peak = (std::max)({peak, r, g, b});
     }
-#endif
 
-    ///
-    /// Fast Peak Detection for R32G32B32A32_FLOAT (Full Scan)
-    /// Scans the entire buffer for the maximum color component value.
-    /// Baseline is 1.0f (SDR white).
-    ///
-    inline float FindPeak_R32G32B32A32_FLOAT(const float* pData, size_t pixelCount) {
-        if (!pData || pixelCount == 0) return 1.0f;
-
-        float peak = 1.0f;
-        size_t i = 0;
-
-        // --- 1. Top of the line: AVX-512 Scan ---
-        if (HasAVX512F()) {
-#ifdef __AVX512F__
-            __m512 vPeak = _mm512_set1_ps(1.0f);
-            // Unroll 4x (16 pixels per loop) to saturate memory throughput
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m512 p0 = _mm512_loadu_ps(pData + (i + 0) * 4);
-                __m512 p1 = _mm512_loadu_ps(pData + (i + 4) * 4);
-                __m512 p2 = _mm512_loadu_ps(pData + (i + 8) * 4);
-                __m512 p3 = _mm512_loadu_ps(pData + (i + 12) * 4);
-                vPeak = _mm512_max_ps(vPeak, _mm512_max_ps(p0, p1));
-                vPeak = _mm512_max_ps(vPeak, _mm512_max_ps(p2, p3));
-            }
-            // Tail
-            for (; i + 4 <= pixelCount; i += 4) {
-                vPeak = _mm512_max_ps(vPeak, _mm512_loadu_ps(pData + i * 4));
-            }
-            peak = _mm512_reduce_max_ps(vPeak);
-#endif
-        }
-        // --- 2. High Performance: AVX2 Scan ---
-        else {
-            __m256 vPeak = _mm256_setzero_ps();
-            __m256 vMask = _mm256_setr_ps(1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f); // Mask out Alpha
-
-            // Unroll 8x (16 pixels per loop)
-            for (; i + 16 <= pixelCount; i += 16) {
-                __m256 p0 = _mm256_loadu_ps(pData + (i + 0) * 4);
-                __m256 p1 = _mm256_loadu_ps(pData + (i + 2) * 4);
-                __m256 p2 = _mm256_loadu_ps(pData + (i + 4) * 4);
-                __m256 p3 = _mm256_loadu_ps(pData + (i + 6) * 4);
-                __m256 p4 = _mm256_loadu_ps(pData + (i + 8) * 4);
-                __m256 p5 = _mm256_loadu_ps(pData + (i + 10) * 4);
-                __m256 p6 = _mm256_loadu_ps(pData + (i + 12) * 4);
-                __m256 p7 = _mm256_loadu_ps(pData + (i + 14) * 4);
-
-                // Keep only R, G, B
-                p0 = _mm256_mul_ps(p0, vMask);
-                p1 = _mm256_mul_ps(p1, vMask);
-                p2 = _mm256_mul_ps(p2, vMask);
-                p3 = _mm256_mul_ps(p3, vMask);
-                p4 = _mm256_mul_ps(p4, vMask);
-                p5 = _mm256_mul_ps(p5, vMask);
-                p6 = _mm256_mul_ps(p6, vMask);
-                p7 = _mm256_mul_ps(p7, vMask);
-
-                __m256 m0 = _mm256_max_ps(p0, p1);
-                __m256 m1 = _mm256_max_ps(p2, p3);
-                __m256 m2 = _mm256_max_ps(p4, p5);
-                __m256 m3 = _mm256_max_ps(p6, p7);
-
-                vPeak = _mm256_max_ps(vPeak, _mm256_max_ps(m0, m1));
-                vPeak = _mm256_max_ps(vPeak, _mm256_max_ps(m2, m3));
-            }
-            // Tail
-            for (; i + 2 <= pixelCount; i += 2) {
-                __m256 p = _mm256_loadu_ps(pData + i * 4);
-                vPeak = _mm256_max_ps(vPeak, _mm256_mul_ps(p, vMask));
-            }
-            peak = _mm256_reduce_max_ps(vPeak);
-        }
+    return peak;
+}
+
+struct AxisCoeff { int idx0, idx1, w0, w1; };
+
+inline void ResizeBilinear_Impl(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride) {
+    if (!src || !dst || w <= 0 || h <= 0 || newW <= 0 || newH <= 0) return;
+    if (srcStride == 0) srcStride = w * 4;
+    if (dstStride == 0) dstStride = newW * 4;
+
+    constexpr int kWeightBits = 11;
+    constexpr int kWeightScale = 1 << kWeightBits;
+    constexpr int kWeightShift = kWeightBits * 2;
+    constexpr int kWeightRound = 1 << (kWeightShift - 1);
+
+    auto buildAxisCoeff = [kWeightScale](int srcSize, int dstSize, std::vector<AxisCoeff>& out) {
+        out.resize(dstSize);
+        if (srcSize <= 1 || dstSize <= 1) {
+            for (int i = 0; i < dstSize; ++i) { out[i].idx0 = 0; out[i].idx1 = 0; out[i].w0 = kWeightScale; out[i].w1 = 0; }
+            return;
+        }
+        const double scale = static_cast<double>(srcSize - 1) / static_cast<double>(dstSize - 1);
+        for (int i = 0; i < dstSize; ++i) {
+            double srcPos = static_cast<double>(i) * scale;
+            int idx0 = (std::min)((std::max)(static_cast<int>(srcPos), 0), srcSize - 1);
+            int idx1 = (std::min)(idx0 + 1, srcSize - 1);
+            double frac = srcPos - static_cast<double>(idx0);
+            int w1 = (std::min)((std::max)(static_cast<int>(frac * static_cast<double>(kWeightScale) + 0.5), 0), kWeightScale);
+            out[i].idx0 = idx0; out[i].idx1 = idx1; out[i].w0 = kWeightScale - w1; out[i].w1 = w1;
+        }
-
-        // --- 3. Robust Fix: Scalar Fallback for partial pixels ---
-        for (; i < pixelCount; ++i) {
-            float r = pData[i * 4 + 0];
-            float g = pData[i * 4 + 1];
-            float b = pData[i * 4 + 2];
-            peak = (std::max)({peak, r, g, b});
-        }
+    };
+
+    std::vector<AxisCoeff> xCoeff, yCoeff;
+    buildAxisCoeff(w, newW, xCoeff);
+    buildAxisCoeff(h, newH, yCoeff);
+
+    for (int y = 0; y < newH; ++y) {
+        const AxisCoeff yc = yCoeff[y];
+        const uint8_t* row0 = src + static_cast<size_t>(yc.idx0) * static_cast<size_t>(srcStride);
+        const uint8_t* row1 = src + static_cast<size_t>(yc.idx1) * static_cast<size_t>(srcStride);
+        uint8_t* pd = dst + static_cast<size_t>(y) * static_cast<size_t>(dstStride);
+
+        int x = 0;
+
+        for (; x < newW; ++x) {
+            const AxisCoeff xc = xCoeff[x];
+            const uint8_t* s00 = row0 + static_cast<size_t>(xc.idx0) * 4;
+            const uint8_t* s01 = row0 + static_cast<size_t>(xc.idx1) * 4;
+            const uint8_t* s10 = row1 + static_cast<size_t>(xc.idx0) * 4;
+            const uint8_t* s11 = row1 + static_cast<size_t>(xc.idx1) * 4;
+
+            const int w00 = xc.w0 * yc.w0;
+            const int w01 = xc.w1 * yc.w0;
+            const int w10 = xc.w0 * yc.w1;
+            const int w11 = xc.w1 * yc.w1;
+
+            const size_t dstBase = static_cast<size_t>(x) * 4;
+            pd[dstBase + 0] = static_cast<uint8_t>((s00[0] * w00 + s01[0] * w01 + s10[0] * w10 + s11[0] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 1] = static_cast<uint8_t>((s00[1] * w00 + s01[1] * w01 + s10[1] * w10 + s11[1] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 2] = static_cast<uint8_t>((s00[2] * w00 + s01[2] * w01 + s10[2] * w10 + s11[2] * w11 + kWeightRound) >> kWeightShift);
+            pd[dstBase + 3] = static_cast<uint8_t>((s00[3] * w00 + s01[3] * w01 + s10[3] * w10 + s11[3] * w11 + kWeightRound) >> kWeightShift);
+        }
-
-        return peak;
     }
+}
+
+} // namespace HWY_NAMESPACE
 } // namespace SIMDUtils
+HWY_AFTER_NAMESPACE();
 
+#if HWY_ONCE
+namespace SIMDUtils {
+    inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
+        HWY_STATIC_DISPATCH(PremultiplyAlpha_BGRA_Impl)(pData, width, height, stride);
+    }
+    inline void SwizzleRGBA_to_BGRA_Premul(uint8_t* pData, size_t pixelCount) {
+        HWY_STATIC_DISPATCH(SwizzleRGBA_to_BGRA_Premul_Impl)(pData, pixelCount);
+    }
+    inline float FindPeak_R32G32B32A32_FLOAT(const float* pData, size_t pixelCount) {
+        return HWY_STATIC_DISPATCH(FindPeak_R32G32B32A32_FLOAT_Impl)(pData, pixelCount);
+    }
+    inline void ResizeBilinear(const uint8_t* src, int w, int h, int srcStride, uint8_t* dst, int newW, int newH, int dstStride = 0) {
+        HWY_STATIC_DISPATCH(ResizeBilinear_Impl)(src, w, h, srcStride, dst, newW, newH, dstStride);
+    }
+}
+#endif
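Worth noting for the wrappers just above: HWY_STATIC_DISPATCH resolves at compile time to Highway's static target, which the macro block at the top of this header pins to SSE4 on x64, so these helpers never select a wider target at runtime the way the histogram kernel in ImageLoader.cpp does. If runtime selection is wanted here as well, the usual shape is to move the _Impl functions into a per-target-compiled .cpp and export them; a sketch under that assumption:

    // Sketch: runtime-dispatched wrapper mirroring the ImageLoader.cpp
    // pattern. Assumes the _Impl definitions live in a translation unit
    // re-included per target via foreach_target.h (see earlier note).
    #if HWY_ONCE
    namespace SIMDUtils {
        HWY_EXPORT(PremultiplyAlpha_BGRA_Impl);
        inline void PremultiplyAlpha_BGRA(uint8_t* pData, int width, int height, int stride = 0) {
            HWY_DYNAMIC_DISPATCH(PremultiplyAlpha_BGRA_Impl)(pData, width, height, stride);
        }
    }
    #endif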
diff --git a/QuickView/SettingsOverlay.cpp b/QuickView/SettingsOverlay.cpp
index 7e1e5eb..187ab8a 100644
--- a/QuickView/SettingsOverlay.cpp
+++ b/QuickView/SettingsOverlay.cpp
@@ -61,14 +61,6 @@ static std::wstring GetAppVersion() {
     return L"2.1.0"; // Fallback
 }
 
-static bool CheckAVX2() {
-    int cpuInfo[4];
-    __cpuid(cpuInfo, 0);
-    if (cpuInfo[0] < 7) return false;
-    __cpuidex(cpuInfo, 7, 0);
-    return (cpuInfo[1] & (1 << 5)) != 0;
-}
-
 // Helper to get Real Windows Version via RtlGetVersion
 typedef LONG (WINAPI *RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
 
@@ -123,8 +115,7 @@ std::wstring GetSystemInfo() {
     if (si.wProcessorArchitecture == PROCESSOR_ARCHITECTURE_ARM64) arch = L"ARM64";
 
     // 3. SIMD
-    std::wstring simd = L"SIMD: AVX2 [Active]"; // Default checked
-    if (!CheckAVX2()) simd = L"SIMD: SSE2";
+    std::wstring simd = L"SIMD: Highway [Active]"; // Default checked
 
     return osVer + L" | " + arch + L" | " + simd;
 }
@@ -1981,8 +1972,8 @@ void SettingsOverlay::Render(ID2D1DeviceContext* pRT, float winW, float winH) {
 
     D2D1_RECT_F textRect = D2D1::RectF(contentX, sysY, contentX + contentW, sysY + 20);
 
-    // Highlight "AVX2 [Active]" in Green
-    size_t pos = item.label.find(L"SIMD: AVX2 [Active]");
+    // Highlight "Highway [Active]" in Green
+    size_t pos = item.label.find(L"SIMD: Highway [Active]");
     if (pos != std::wstring::npos) {
         // Draw first part Gray
         std::wstring part1 = item.label.substr(0, pos);
@@ -1990,7 +1981,7 @@ void SettingsOverlay::Render(ID2D1DeviceContext* pRT, float winW, float winH) {
 
         // Draw active part Green (Approx offset)
         D2D1_RECT_F avxRect = D2D1::RectF(contentX + 225.0f * s, sysY, contentX + contentW, sysY + 20.0f * s);
-        pRT->DrawText(L"SIMD: AVX2 [Active]", 19, m_textFormatItem.Get(), avxRect, m_brushSuccess.Get());
+        pRT->DrawText(L"SIMD: Highway [Active]", 22, m_textFormatItem.Get(), avxRect, m_brushSuccess.Get());
     } else {
         pRT->DrawText(item.label.c_str(), (UINT32)item.label.length(), m_textFormatItem.Get(), textRect, m_brushTextDim.Get());
     }
diff --git a/QuickView/main.cpp b/QuickView/main.cpp
index d78dec2..35dd4da 100644
--- a/QuickView/main.cpp
+++ b/QuickView/main.cpp
@@ -149,27 +149,6 @@ static std::string GetAppVersionUTF8() {
     return "2.1.0";
 }
 
-static bool SupportsAvx2ByCpuid() {
-#if defined(_M_X64) || defined(_M_IX86)
-    int cpuInfo[4] = {};
-    __cpuid(cpuInfo, 0);
-    if (cpuInfo[0] < 7) return false;
-
-    __cpuid(cpuInfo, 1);
-    const bool hasOsxsave = (cpuInfo[2] & (1 << 27)) != 0;
-    const bool hasAvx = (cpuInfo[2] & (1 << 28)) != 0;
-    if (!hasOsxsave || !hasAvx) return false;
-
-    const unsigned long long xcr0 = _xgetbv(0);
-    if ((xcr0 & 0x6) != 0x6) return false;
-
-    __cpuidex(cpuInfo, 7, 0);
-    return (cpuInfo[1] & (1 << 5)) != 0;
-#else
-    return false;
-#endif
-}
-
 // Function Prototypes
 static void SyncDCompState(HWND hwnd, float winW, float winH, bool animate);
 
@@ -5630,20 +5609,6 @@ int WINAPI wWinMain(HINSTANCE hInstance, HINSTANCE, LPWSTR lpCmdLine, int nCmdSh
         }
         g_isMasterProcess = (routeResult == QuickView::ProcessRouter::RouteResult::BecameMaster);
     }
-
-    // [v3.2.3] AVX2 Check - Critical: App compiled with /arch:AVX2, will crash without it
-    const bool hasAvx2 = IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE) || SupportsAvx2ByCpuid();
-    if (!hasAvx2) {
-        MessageBoxW(nullptr,
-            L"QuickView requires a CPU with AVX2 support.\n\n"
-            L"Minimum Requirements:\n"
-            L"Intel: Core 4th Gen (Haswell, 2013) or later\n"
-            L"AMD: Ryzen (Zen, 2017) or later\n\n"
-            L"Your CPU does not support AVX2. The application cannot run.",
-            L"QuickView - Hardware Not Supported",
-            MB_OK | MB_ICONERROR);
-        return 1;
-    }
 
     AppStrings::Init();
diff --git a/fix.sh b/fix.sh
new file mode 100644
index 0000000..b31b53f
--- /dev/null
+++ b/fix.sh
@@ -0,0 +1 @@
+git add QuickView/ImageLoader.cpp QuickView/SIMDUtils.h QuickView/SettingsOverlay.cpp QuickView/main.cpp QuickView/QuickView.vcxproj
diff --git a/test_main b/test_main
new file mode 100755
index 0000000..6d6f3b2
Binary files /dev/null and b/test_main differ
diff --git a/test_main.cpp b/test_main.cpp
new file mode 100644
index 0000000..c25e100
--- /dev/null
+++ b/test_main.cpp
@@ -0,0 +1,30 @@
+#include "QuickView/SystemInfo.h"
+#ifndef HWY_TARGETS
+#if defined(_M_X64) || defined(__x86_64__)
+  #undef HWY_BASELINE_TARGETS
+  #define HWY_BASELINE_TARGETS (HWY_SSE4)
+#endif
+#endif
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace SIMD_ImageLoader {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+inline void ComputeHistRow(const uint8_t* row, int width, uint32_t* HistR, uint32_t* HistG, uint32_t* HistB, uint32_t* HistL, int& x_out) {
+}
+}
+}
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace SIMD_ImageLoader {
+    HWY_EXPORT(ComputeHistRow);
+}
+#endif
+
+int main() {
+    int x_out = 0;
+    HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(nullptr, 0, nullptr, nullptr, nullptr, nullptr, x_out);
+    return 0;
+}
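The test_main.cpp harness above only proves that the export/dispatch machinery links and tolerates an empty call against a stub kernel. A natural next step, sketched here against the real ComputeHistRow (buffer sizes and pixel values are arbitrary illustrations), is to feed one solid-color BGRA row and check that each histogram bucket receives one count per processed pixel:

    // Hypothetical extension of test_main.cpp's main(): one 64-pixel row,
    // B=10 G=20 R=30 A=255. Expected luma bin per the kernel's formula:
    // (30*299 + 20*587 + 10*114 + 500) / 1000 = 22.
    #include <cstdint>
    #include <cstdio>
    #include <vector>
    int main() {
        std::vector<uint8_t> row(64 * 4);
        for (size_t i = 0; i < row.size(); i += 4) {
            row[i + 0] = 10; row[i + 1] = 20; row[i + 2] = 30; row[i + 3] = 255;
        }
        std::vector<uint32_t> r(256), g(256), b(256), l(256);
        int x_out = 0;
        HWY_DYNAMIC_DISPATCH(SIMD_ImageLoader::ComputeHistRow)(
            row.data(), 64, r.data(), g.data(), b.data(), l.data(), x_out);
        // Every processed pixel should land in exactly one bucket per channel.
        std::printf("processed=%d B[10]=%u G[20]=%u R[30]=%u L[22]=%u\n",
                    x_out, b[10], g[20], r[30], l[22]);
        return 0;
    }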