// Patch summary (leading text before the first "diff --git" header; not part of any hunk).
//
// CMakeLists.txt
//   - add_arch_library(): adds `-flto` as a PRIVATE compile option of ${NAME}_${ARCH}.
//     NOTE(review): the flag is added unconditionally in this hunk; confirm every compiler
//     used by the CI jobs touched below accepts `-flto` in this form.
//
// azure-pipelines.yml
//   - The `ci/run.sh build-release` command in the hunk at line 118 gains
//     `-DCPU_ARCH=sse2 -DENABLE_DFT_MULTIARCH=ON`.
//   - The two `ci\run.cmd build-release` clang-cl commands (-m64 and -m32) switch
//     `-DCPU_ARCH=avx512` to `-DCPU_ARCH=sse2` and gain `-DENABLE_DFT_MULTIARCH=ON`;
//     the -m32 command additionally gains `-DENABLE_CAPI_BUILD=ON`.
//
// include/kfr/dft/impl/ft.hpp
//   - twiddleimagmask(): specifiers change from `constexpr KFR_INTRINSIC` to
//     `constexpr static inline`.
//     NOTE(review): the new tw*() helpers below keep KFR_INTRINSIC; confirm the different
//     specifier here is intentional.
//   - tw3r1, tw3i1, tw5r1, tw5i1, tw5r2, tw5i2, tw7r1, tw7i1, tw7r2, tw7i2, tw7r3, tw7i3:
//     each `static const` / `const static` definition is replaced by a
//     `static constexpr KFR_INTRINSIC` function returning the same expression.
//   - butterfly3 / butterfly5 / butterfly7: every use of the names above becomes a call
//     (e.g. `tw3r1` -> `tw3r1()`); the arithmetic is otherwise unchanged.
//   - generic_butterfly(): the cswitch(...) call appears to be re-wrapped only; the removed
//     and added tokens visible here are the same.
//
// tests/CMakeLists.txt
//   - Removes the `else ()` branch that registered `all_tests` via ${EMULATOR} and adds an
//     add_test() after the `endif ()` that runs all_tests through
//     `${SDE} ${SDE_ARCH_${CPU_ARCH}} -chip_check_exe_only --`.
//     NOTE(review): the enclosing condition is outside this hunk; confirm ${SDE} and
//     ${SDE_ARCH_${CPU_ARCH}} are defined wherever this add_test() is reached, and that
//     dropping the ${EMULATOR} path is intended.
//
// NOTE(review): this copy of the patch appears to have lost its line breaks and the
// angle-bracket template argument lists (e.g. `template` with no parameter list,
// `static_cast(` with no type). Presumably an extraction artifact - restore from the
// original patch before attempting to apply it.
diff --git a/CMakeLists.txt b/CMakeLists.txt index 260086ec..6a130e71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,7 @@ function (add_arch_library NAME ARCH SRCS DEFS) target_set_arch(${NAME}_${ARCH} PRIVATE ${ARCH}) target_compile_options(${NAME}_${ARCH} PRIVATE ${DEFS}) target_link_libraries(${NAME}_all INTERFACE ${NAME}_${ARCH}) + target_compile_options(${NAME}_${ARCH} PRIVATE -flto) endfunction () if (ENABLE_DFT) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4c08a5be..5e75681b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -118,7 +118,7 @@ jobs: /bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer" brew install ninja - ci/run.sh build-release -DENABLE_CAPI_BUILD=ON -DUSE_SDE=ON -DARCH_TESTS=sse2,ssse3,sse41,avx,avx2,avx512 -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DENABLE_CAPI_BUILD=ON -DUSE_SDE=ON -DARCH_TESTS=sse2,ssse3,sse41,avx,avx2,avx512 -DCPU_ARCH=sse2 -DENABLE_DFT_MULTIARCH=ON -DCMAKE_BUILD_TYPE=Release - job: iOS_ARM_Clang_Release timeoutInMinutes: 120 @@ -217,7 +217,7 @@ jobs: set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% set PATH=%PATH:C:\Strawberry\c\bin;=% set PATH=C:\sde;%PATH% - ci\run.cmd build-release -DENABLE_CAPI_BUILD=ON -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DUSE_SDE=ON -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DENABLE_CAPI_BUILD=ON -DARCH_TESTS=ON -DENABLE_DFT_MULTIARCH=ON -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DUSE_SDE=ON -DCPU_ARCH=sse2 -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_AVX512_Clang9_Release timeoutInMinutes: 120 @@ -236,7 +236,7 @@ jobs: set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% set PATH=%PATH:C:\Strawberry\c\bin;=% set PATH=C:\sde;%PATH% - ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DUSE_SDE=ON 
-DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DENABLE_CAPI_BUILD=ON -DARCH_TESTS=ON -DENABLE_DFT_MULTIARCH=ON -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DUSE_SDE=ON -DCPU_ARCH=sse2 -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_64_AVX512_MSVC2017_Release timeoutInMinutes: 120 diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp index 88242170..5cff8a59 100644 --- a/include/kfr/dft/impl/ft.hpp +++ b/include/kfr/dft/impl/ft.hpp @@ -483,7 +483,7 @@ constexpr KFR_INTRINSIC cvec fixed_twiddle(size_t size, size_t start, // constexpr cvec fixed_twiddle = get_fixed_twiddle(); template -constexpr KFR_INTRINSIC cvec twiddleimagmask() +constexpr static inline cvec twiddleimagmask() { return inverse ? broadcast(-1, +1) : broadcast(+1, -1); } @@ -1022,11 +1022,16 @@ KFR_INTRINSIC void apply_twiddles2(cvec& a1) } template -static const cvec tw3r1 = static_cast(-0.5 - 1.0); +static constexpr KFR_INTRINSIC cvec tw3r1() +{ + return static_cast(-0.5 - 1.0); +} template -static const cvec tw3i1 = - static_cast(0.86602540378443864676372317075) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw3i1() +{ + return static_cast(0.86602540378443864676372317075) * twiddleimagmask(); +} template KFR_INTRINSIC void butterfly3(cvec a00, cvec a01, cvec a02, cvec& w00, @@ -1037,9 +1042,9 @@ KFR_INTRINSIC void butterfly3(cvec a00, cvec a01, cvec a02, cv const cvec dif1 = swap<2>(a01 - a02); w00 = a00 + sum1; - const cvec s1 = w00 + sum1 * tw3r1; + const cvec s1 = w00 + sum1 * tw3r1(); - const cvec d1 = dif1 * tw3i1; + const cvec d1 = dif1 * tw3i1(); w01 = s1 + d1; w02 = s1 - d1; @@ -1132,25 +1137,40 @@ KFR_INTRINSIC void butterfly9(cvec& a0, cvec& a1, cvec& a2, cv } template -static const cvec tw7r1 = static_cast(0.623489801858733530525004884 - 1.0); +static constexpr KFR_INTRINSIC cvec tw7r1() +{ + return static_cast(0.623489801858733530525004884 - 1.0); +} template -static 
const cvec tw7i1 = - static_cast(0.78183148246802980870844452667) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw7i1() +{ + return static_cast(0.78183148246802980870844452667) * twiddleimagmask(); +} template -static const cvec tw7r2 = static_cast(-0.2225209339563144042889025645 - 1.0); +static constexpr KFR_INTRINSIC cvec tw7r2() +{ + return static_cast(-0.2225209339563144042889025645 - 1.0); +} template -static const cvec tw7i2 = - static_cast(0.97492791218182360701813168299) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw7i2() +{ + return static_cast(0.97492791218182360701813168299) * twiddleimagmask(); +} template -static const cvec tw7r3 = static_cast(-0.90096886790241912623610231951 - 1.0); +static constexpr KFR_INTRINSIC cvec tw7r3() +{ + return static_cast(-0.90096886790241912623610231951 - 1.0); +} template -static const cvec tw7i3 = - static_cast(0.43388373911755812047576833285) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw7i3() +{ + return static_cast(0.43388373911755812047576833285) * twiddleimagmask(); +} template KFR_INTRINSIC void butterfly7(cvec a00, cvec a01, cvec a02, cvec a03, cvec a04, @@ -1167,18 +1187,18 @@ KFR_INTRINSIC void butterfly7(cvec a00, cvec a01, cvec a02, cv w00 = a00 + sum1 + sum2 + sum3; const cvec s1 = - w00 + sum1 * tw7r1 + sum2 * tw7r2 + sum3 * tw7r3; + w00 + sum1 * tw7r1() + sum2 * tw7r2() + sum3 * tw7r3(); const cvec s2 = - w00 + sum1 * tw7r2 + sum2 * tw7r3 + sum3 * tw7r1; + w00 + sum1 * tw7r2() + sum2 * tw7r3() + sum3 * tw7r1(); const cvec s3 = - w00 + sum1 * tw7r3 + sum2 * tw7r1 + sum3 * tw7r2; + w00 + sum1 * tw7r3() + sum2 * tw7r1() + sum3 * tw7r2(); const cvec d1 = - dif1 * tw7i1 + dif2 * tw7i2 + dif3 * tw7i3; + dif1 * tw7i1() + dif2 * tw7i2() + dif3 * tw7i3(); const cvec d2 = - dif1 * tw7i2 - dif2 * tw7i3 - dif3 * tw7i1; + dif1 * tw7i2() - dif2 * tw7i3()- dif3 * tw7i1(); const cvec d3 = - dif1 * tw7i3 - dif2 * tw7i1 + dif3 * tw7i2; + dif1 * tw7i3() - dif2 * tw7i1() + dif3 * 
tw7i2(); w01 = s1 + d1; w06 = s1 - d1; @@ -1294,15 +1314,25 @@ KFR_INTRINSIC void butterfly11(cvec a00, cvec a01, cvec a02, c } template -const static cvec tw5r1 = static_cast(0.30901699437494742410229341718 - 1.0); +static constexpr KFR_INTRINSIC cvec tw5r1() +{ + return static_cast(0.30901699437494742410229341718 - 1.0); +} template -const static cvec tw5i1 = - static_cast(0.95105651629515357211643933338) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw5i1() +{ + return static_cast(0.95105651629515357211643933338) * twiddleimagmask(); +} template -const static cvec tw5r2 = static_cast(-0.80901699437494742410229341718 - 1.0); +static constexpr KFR_INTRINSIC cvec tw5r2() +{ + return static_cast(-0.80901699437494742410229341718 - 1.0); +} template -const static cvec tw5i2 = - static_cast(0.58778525229247312916870595464) * twiddleimagmask(); +static constexpr KFR_INTRINSIC cvec tw5i2() +{ + return static_cast(0.58778525229247312916870595464) * twiddleimagmask(); +} template KFR_INTRINSIC void butterfly5(const cvec& a00, const cvec& a01, const cvec& a02, @@ -1315,11 +1345,11 @@ KFR_INTRINSIC void butterfly5(const cvec& a00, const cvec& a01, cons const cvec dif2 = swap<2>(a02 - a03); w00 = a00 + sum1 + sum2; - const cvec s1 = w00 + sum1 * tw5r1 + sum2 * tw5r2; - const cvec s2 = w00 + sum1 * tw5r2 + sum2 * tw5r1; + const cvec s1 = w00 + sum1 * tw5r1() + sum2 * tw5r2(); + const cvec s2 = w00 + sum1 * tw5r2() + sum2 * tw5r1(); - const cvec d1 = dif1 * tw5i1 + dif2 * tw5i2; - const cvec d2 = dif1 * tw5i2 - dif2 * tw5i1; + const cvec d1 = dif1 * tw5i1() + dif2 * tw5i2(); + const cvec d2 = dif1 * tw5i2() - dif2 * tw5i1(); w01 = s1 + d1; w04 = s1 - d1; @@ -1690,16 +1720,15 @@ template > KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t, complex* out, const complex* in, complex*, const complex* twiddle, Tstride ostride = {}) { - cswitch( - csizes_t<11, 13>(), radix, - [&](auto radix_) CMT_INLINE_LAMBDA { - constexpr size_t width = vector_width; - 
spec_generic_butterfly_w(radix_, cbool_t(), out, in, twiddle, ostride); - }, - [&]() CMT_INLINE_LAMBDA { - constexpr size_t width = vector_width; - generic_butterfly_w(radix, cbool_t(), out, in, twiddle, ostride); - }); + cswitch(csizes_t<11, 13>(), radix, + [&](auto radix_) CMT_INLINE_LAMBDA { + constexpr size_t width = vector_width; + spec_generic_butterfly_w(radix_, cbool_t(), out, in, twiddle, ostride); + }, + [&]() CMT_INLINE_LAMBDA { + constexpr size_t width = vector_width; + generic_butterfly_w(radix, cbool_t(), out, in, twiddle, ostride); + }); } template diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4a7cdd75..bf3f550d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -253,8 +253,7 @@ if (NOT SKIP_TESTS) ${PROJECT_BINARY_DIR}/bin/all_tests_${A}) endif () endforeach () - else () - add_test(NAME all_tests COMMAND ${EMULATOR} - ${PROJECT_BINARY_DIR}/bin/all_tests) endif () + add_test(NAME all_tests COMMAND ${SDE} ${SDE_ARCH_${CPU_ARCH}} -chip_check_exe_only + -- ${PROJECT_BINARY_DIR}/bin/all_tests) endif ()