From 48314f429b824eccf07c3910898ea7b92fc85d32 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 8 Jan 2020 14:44:27 -0700 Subject: [PATCH 1/2] enable make format using clang format, enable it on travis. --- .clang-format | 10 ++++++ .travis.yml | 3 +- CMakeLists.txt | 16 ++++++++-- cmake/FindCLANG_FORMAT.cmake | 60 ++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 .clang-format create mode 100644 cmake/FindCLANG_FORMAT.cmake diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..b8c0e2ce --- /dev/null +++ b/.clang-format @@ -0,0 +1,10 @@ +BasedOnStyle: LLVM +--- +Language: Cpp +AlwaysBreakTemplateDeclarations: true +BreakBeforeBraces: Allman +BinPackParameters: true +IndentWidth: 4 +SpacesInParentheses: true +BreakConstructorInitializersBeforeComma: true +PointerAlignment: Left diff --git a/.travis.yml b/.travis.yml index 40352990..67a4e0b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,8 @@ env: #maybe add mpich later script: - mkdir build && cd build && PATH="$HOME/bin:/usr/lib/ccache:$PATH" CC=gcc-${GVER} CXX=g++-${GVER} cmake -DENABLE_INTEGRATED_TESTS=ON -DENABLE_UNIT_TESTS=ON ${USE_V4_SSE:+-DUSE_V4_SSE=ON} ${USE_V4_PORTABLE:+-DUSE_V4_PORTABLE=ON} ${LONG_TESTS:+-DENABLE_LONG_TESTS=ON} ${COVERAGE:+-DENABLE_COVERAGE_BUILD=ON} .. && - make -j4 VERBOSE=1 && make test CTEST_OUTPUT_ON_FAILURE=1 && make install DESTDIR=$PWD + #make -j4 VERBOSE=1 && make test CTEST_OUTPUT_ON_FAILURE=1 && make install DESTDIR=$PWD + make -j4 VERBOSE=1 && make test CTEST_OUTPUT_ON_FAILURE=1 && make format && git diff --exit-code && make install DESTDIR=$PWD after_success: - if [[ ${COVERAGE} ]]; then cd .. 
&& codecov --gcov-exec gcov-${GVER}; fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 2724a9ff..b42d78f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,8 @@ endif() set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + #------------------------------------------------------------------------------# # Set C flags #------------------------------------------------------------------------------# @@ -361,6 +363,7 @@ file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VPIC_CXX_FLAGS}") file(GLOB_RECURSE VPIC_SRC src/*.c src/*.cc) +file(GLOB_RECURSE VPIC_HEADER src/*.h) file(GLOB_RECURSE VPIC_NOT_SRC src/util/v4/test/v4.cc src/util/v8/test/v8.cc @@ -438,6 +441,13 @@ if(ENABLE_PERFORMANCE_TESTS) include_directories(${CATCH_DIR}) add_subdirectory(test/performance) endif(ENABLE_PERFORMANCE_TESTS) -#~---------------------------------------------------------------------------~-# -# vim: set tabstop=2 shiftwidth=2 expandtab : -#~---------------------------------------------------------------------------~-# + +# enable "make format" and clang-format +find_package(CLANG_FORMAT) +if(CLANG_FORMAT_FOUND) + #file(GLOB_RECURSE FORMAT_SOURCES core/*.cpp core/*.hpp) + set(FORMAT_SOURCES ${VPIC_SRC} ${VPIC_HEADER}) + add_custom_target(format + COMMAND ${CLANG_FORMAT_EXECUTABLE} -i -style=file ${FORMAT_SOURCES} + DEPENDS ${FORMAT_SOURCES}) +endif() diff --git a/cmake/FindCLANG_FORMAT.cmake b/cmake/FindCLANG_FORMAT.cmake new file mode 100644 index 00000000..c4f42293 --- /dev/null +++ b/cmake/FindCLANG_FORMAT.cmake @@ -0,0 +1,60 @@ +############################################################################ +# Copyright (c) 2019 by the Cabana authors # +# All rights reserved. # +# # +# This file is part of the Cabana library. Cabana is distributed under a # +# BSD 3-clause license. 
For the licensing terms see the LICENSE file in # +# the top-level directory. # +# # +# SPDX-License-Identifier: BSD-3-Clause # +############################################################################ +# +# Find clang-format +# +# CLANG_FORMAT_EXECUTABLE - Path to clang-format executable +# CLANG_FORMAT_FOUND - True if the clang-format executable was found. +# CLANG_FORMAT_VERSION - The version of clang-format found +# + +find_program(CLANG_FORMAT_EXECUTABLE + NAMES clang-format + clang-format-7 + clang-format-6.0 + clang-format-5.0 + clang-format-4.0 + clang-format-3.9 + clang-format-3.8 + clang-format-3.7 + clang-format-3.6 + clang-format-3.5 + clang-format-3.4 + clang-format-3.3 + DOC "clang-format executable") +mark_as_advanced(CLANG_FORMAT_EXECUTABLE) + +# Extract version from command "clang-format -version" +if(CLANG_FORMAT_EXECUTABLE) + execute_process(COMMAND ${CLANG_FORMAT_EXECUTABLE} -version + OUTPUT_VARIABLE clang_format_version + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(clang_format_version MATCHES "^clang-format version .*") + # clang_format_version sample: "clang-format version 3.9.1-4ubuntu3~16.04.1 + # (tags/RELEASE_391/rc2)" + string(REGEX + REPLACE "clang-format version ([.0-9]+).*" + "\\1" + CLANG_FORMAT_VERSION + "${clang_format_version}") + # CLANG_FORMAT_VERSION sample: "3.9.1" + else() + set(CLANG_FORMAT_VERSION 0.0) + endif() +else() + set(CLANG_FORMAT_VERSION 0.0) +endif() + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set CLANG_FORMAT_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(CLANG_FORMAT REQUIRED_VARS CLANG_FORMAT_EXECUTABLE VERSION_VAR CLANG_FORMAT_VERSION) From 214030d0511e6c033de5d4b3770983227e51a1ca Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 8 Jan 2020 14:44:36 -0700 Subject: [PATCH 2/2] Apply clang format code wide --- src/boundary/boundary.h | 52 +- src/boundary/boundary_private.h | 68 +- src/collision/binary.h | 23 +- 
src/collision/collision.h | 194 +- src/collision/collision_private.h | 47 +- src/collision/langevin.h | 13 +- src/collision/pipeline/collision_pipeline.h | 26 +- src/collision/takizuka_abe.h | 18 +- src/collision/unary.h | 19 +- src/emitter/emitter.h | 57 +- src/emitter/emitter_private.h | 49 +- src/field_advance/field_advance.h | 102 +- src/field_advance/field_advance_private.h | 6 +- .../standard/pipeline/advance_b_pipeline.h | 109 +- .../standard/pipeline/advance_e_pipeline.h | 138 +- .../standard/pipeline/clean_div_b_pipeline.h | 39 +- .../standard/pipeline/clean_div_e_pipeline.h | 107 +- .../pipeline/compute_curl_b_pipeline.h | 122 +- .../pipeline/compute_div_b_err_pipeline.h | 15 +- .../pipeline/compute_div_e_err_pipeline.h | 100 +- .../standard/pipeline/compute_rhob_pipeline.h | 98 +- .../pipeline/compute_rms_div_b_err_pipeline.h | 18 +- .../pipeline/compute_rms_div_e_err_pipeline.h | 18 +- .../standard/pipeline/energy_f_pipeline.h | 118 +- .../pipeline/vacuum_advance_e_pipeline.h | 150 +- .../pipeline/vacuum_clean_div_e_pipeline.h | 93 +- .../pipeline/vacuum_compute_curl_b_pipeline.h | 122 +- .../vacuum_compute_div_e_err_pipeline.h | 100 +- .../pipeline/vacuum_compute_rhob_pipeline.h | 95 +- .../pipeline/vacuum_energy_f_pipeline.h | 127 +- src/field_advance/standard/sfa_private.h | 233 +- src/grid/grid.h | 399 +- src/material/material.h | 55 +- .../pipeline/sf_interface_pipeline.h | 105 +- src/sf_interface/sf_interface.h | 124 +- src/sf_interface/sf_interface_private.h | 22 +- src/species_advance/species_advance.h | 119 +- src/species_advance/species_advance_aos.h | 120 +- .../standard/pipeline/spa_private.h | 274 +- src/util/bitfield.h | 173 +- src/util/checkpt/checkpt.h | 180 +- src/util/checkpt/checkpt_io.h | 96 +- src/util/checkpt/checkpt_private.h | 19 +- src/util/checksum.h | 72 +- src/util/io/FileIO.h | 113 +- src/util/io/FileIOData.h | 36 +- src/util/io/FileUtils.h | 36 +- src/util/io/P2PIOPolicy.h | 1055 +-- src/util/io/P2PUtilsPolicy.h | 64 +- 
src/util/io/StandardIOPolicy.h | 239 +- src/util/io/StandardUtilsPolicy.h | 50 +- src/util/mp/DMPPolicy.h | 647 +- src/util/mp/MPWrapper.h | 25 +- src/util/mp/RelayPolicy.h | 723 +- src/util/mp/mp.h | 133 +- src/util/pipelines/pipelines.h | 13 +- src/util/pipelines/pipelines_exec.h | 8 +- src/util/pipelines/pipelines_exec_omp.h | 90 +- src/util/pipelines/pipelines_exec_pth.h | 42 +- src/util/pipelines/pipelines_openmp.h | 18 +- src/util/pipelines/pipelines_pthreads.h | 95 +- src/util/profile/profile.h | 123 +- src/util/rng/drandn_table.h | 4 +- src/util/rng/frandn_table.h | 2 +- src/util/rng/rng.h | 190 +- src/util/rng/rng_private.h | 521 +- src/util/swap.h | 311 +- src/util/system.h | 91 +- src/util/util.h | 18 +- src/util/util_base.h | 282 +- src/util/v16/v16.h | 10 +- src/util/v16/v16_avx512.h | 5702 +++++++----- src/util/v16/v16_portable.h | 7746 +++++++++-------- src/util/v16/v16_portable_v0.h | 7746 +++++++++-------- src/util/v16/v16_portable_v1.h | 6489 +++++++------- src/util/v4/v4.h | 26 +- src/util/v4/v4_altivec.h | 1706 ++-- src/util/v4/v4_avx.h | 1610 ++-- src/util/v4/v4_avx2.h | 1546 ++-- src/util/v4/v4_neon.h | 1700 ++-- src/util/v4/v4_portable.h | 1712 ++-- src/util/v4/v4_portable_v0.h | 1712 ++-- src/util/v4/v4_portable_v1.h | 1705 ++-- src/util/v4/v4_sse.h | 1520 ++-- src/util/v8/v8.h | 14 +- src/util/v8/v8_avx.h | 2008 ++--- src/util/v8/v8_avx2.h | 2040 ++--- src/util/v8/v8_portable.h | 2565 +++--- src/util/v8/v8_portable_v0.h | 2565 +++--- src/util/v8/v8_portable_v1.h | 2362 ++--- src/vpic/dumpmacros.h | 331 +- src/vpic/vpic.h | 1246 +-- src/vpic/vpic_unit_deck.h | 6 +- 93 files changed, 32409 insertions(+), 30821 deletions(-) diff --git a/src/boundary/boundary.h b/src/boundary/boundary.h index e19f65f3..850dce98 100644 --- a/src/boundary/boundary.h +++ b/src/boundary/boundary.h @@ -7,59 +7,47 @@ struct particle_bc; typedef struct particle_bc particle_bc_t; // may have been moved by Kevin -typedef struct link_boundary { -char fbase[256]; // base 
of file name to contain link info -double n_out; // number of writes so far on this node (double to - // accomodate long long runs) +typedef struct link_boundary +{ + char fbase[256]; // base of file name to contain link info + double n_out; // number of writes so far on this node (double to + // accomodate long long runs) } link_boundary_t; BEGIN_C_DECLS /* In boundary.c */ -int -num_particle_bc( const particle_bc_t * RESTRICT pbc_list ); +int num_particle_bc( const particle_bc_t* RESTRICT pbc_list ); -void -delete_particle_bc_list( particle_bc_t * RESTRICT pbc_list ); +void delete_particle_bc_list( particle_bc_t* RESTRICT pbc_list ); -particle_bc_t * -append_particle_bc( particle_bc_t * pbc, - particle_bc_t ** pbc_list ); +particle_bc_t* append_particle_bc( particle_bc_t* pbc, + particle_bc_t** pbc_list ); -int64_t -get_particle_bc_id( particle_bc_t * pbc ); +int64_t get_particle_bc_id( particle_bc_t* pbc ); /* In boundary_p.cxx */ -void -boundary_p( particle_bc_t * RESTRICT pbc_list, - species_t * RESTRICT sp_list, - field_array_t * RESTRICT fa, - accumulator_array_t * RESTRICT aa ); +void boundary_p( particle_bc_t* RESTRICT pbc_list, species_t* RESTRICT sp_list, + field_array_t* RESTRICT fa, accumulator_array_t* RESTRICT aa ); /* In maxwellian_reflux.c */ -particle_bc_t * -maxwellian_reflux( species_t * RESTRICT sp_list, - rng_pool_t * RESTRICT rp ); +particle_bc_t* maxwellian_reflux( species_t* RESTRICT sp_list, + rng_pool_t* RESTRICT rp ); -void -set_reflux_temp( /**/ particle_bc_t * RESTRICT mr, - const species_t * RESTRICT sp, - float ut_para, - float ut_perp ); +void set_reflux_temp( /**/ particle_bc_t* RESTRICT mr, + const species_t* RESTRICT sp, float ut_para, + float ut_perp ); /* In absorb_tally.c */ -particle_bc_t * -absorb_tally( /**/ species_t * RESTRICT sp_list, - const field_array_t * RESTRICT fa ); +particle_bc_t* absorb_tally( /**/ species_t* RESTRICT sp_list, + const field_array_t* RESTRICT fa ); -int * -get_absorb_tally( particle_bc_t * pbc ); 
+int* get_absorb_tally( particle_bc_t* pbc ); END_C_DECLS #endif /* _boundary_h_ */ - diff --git a/src/boundary/boundary_private.h b/src/boundary/boundary_private.h index 7d54f439..94c029f1 100644 --- a/src/boundary/boundary_private.h +++ b/src/boundary/boundary_private.h @@ -14,53 +14,53 @@ of these handlers update rhob according to net charge added and removed from the simulation by these functions. */ -typedef int /* Number of particles injected */ -(*particle_bc_func_t)( /* The boundary whose ... */ - void * RESTRICT b, /* parameters are b was hit by ... */ - species_t * RESTRICT sp, /* a particle from this species ... */ - particle_t * RESTRICT p, /* this particle in fact - (position is hit location, momentum - is at time of the hit) ... */ - particle_mover_t * RESTRICT pm, /* who had this much displacement - remaining when it hit */ - particle_injector_t * RESTRICT pi, /* Injectors for particles created by +typedef int /* Number of particles injected */ + ( *particle_bc_func_t )( /* The boundary whose ... */ + void* RESTRICT + b, /* parameters are b was hit by ... */ + species_t* RESTRICT + sp, /* a particle from this species ... */ + particle_t* RESTRICT + p, /* this particle in fact + (position is hit location, momentum + is at time of the hit) ... 
*/ + particle_mover_t* RESTRICT + pm, /* who had this much displacement + remaining when it hit */ + particle_injector_t* RESTRICT + pi, /* Injectors for particles created by the interaction */ - int max_pi, /* Max number injections allowed */ - int face ); /* CONVENIENCE: Which face of the + int max_pi, /* Max number injections allowed */ + int face ); /* CONVENIENCE: Which face of the the voxel containing the above particle was hit */ -typedef void -(*delete_particle_bc_func_t)( particle_bc_t * RESTRICT pbc ); +typedef void ( *delete_particle_bc_func_t )( particle_bc_t* RESTRICT pbc ); -struct particle_bc { - void * params; - particle_bc_func_t interact; - delete_particle_bc_func_t delete_pbc; - int64_t id; - particle_bc_t * next; +struct particle_bc +{ + void* params; + particle_bc_func_t interact; + delete_particle_bc_func_t delete_pbc; + int64_t id; + particle_bc_t* next; }; BEGIN_C_DECLS -void -checkpt_particle_bc_internal( const particle_bc_t * pbc ); +void checkpt_particle_bc_internal( const particle_bc_t* pbc ); -particle_bc_t * -restore_particle_bc_internal( void * params ); +particle_bc_t* restore_particle_bc_internal( void* params ); -particle_bc_t * -new_particle_bc_internal( void * params, - particle_bc_func_t interact, - delete_particle_bc_func_t delete_pbc, - checkpt_func_t checkpt, - restore_func_t restore, - reanimate_func_t reanimate ); +particle_bc_t* new_particle_bc_internal( void* params, + particle_bc_func_t interact, + delete_particle_bc_func_t delete_pbc, + checkpt_func_t checkpt, + restore_func_t restore, + reanimate_func_t reanimate ); -void -delete_particle_bc_internal( particle_bc_t * pbc ); +void delete_particle_bc_internal( particle_bc_t* pbc ); END_C_DECLS #endif /* _boundary_h_ */ - diff --git a/src/collision/binary.h b/src/collision/binary.h index aa16dd37..52e0da93 100644 --- a/src/collision/binary.h +++ b/src/collision/binary.h @@ -5,19 +5,18 @@ typedef struct binary_collision_model { - char * name; - binary_rate_constant_func_t 
rate_constant; - binary_collision_func_t collision; - void * params; - species_t * spi; - species_t * spj; - rng_pool_t * rp; - double sample; - int interval; - int n_large_pr[ MAX_PIPELINE ]; + char* name; + binary_rate_constant_func_t rate_constant; + binary_collision_func_t collision; + void* params; + species_t* spi; + species_t* spj; + rng_pool_t* rp; + double sample; + int interval; + int n_large_pr[MAX_PIPELINE]; } binary_collision_model_t; -void -apply_binary_collision_model_pipeline( binary_collision_model_t * cm ); +void apply_binary_collision_model_pipeline( binary_collision_model_t* cm ); #endif /* _binary_h_ */ diff --git a/src/collision/collision.h b/src/collision/collision.h index 4a79cbd2..cf20512d 100644 --- a/src/collision/collision.h +++ b/src/collision/collision.h @@ -9,7 +9,7 @@ #include #ifndef M_PI -# define M_PI 3.14159265358979323846 +#define M_PI 3.14159265358979323846 #endif struct collision_op; @@ -19,18 +19,14 @@ BEGIN_C_DECLS /* In collision.c */ -int -num_collision_op( const collision_op_t * RESTRICT cop_list ); +int num_collision_op( const collision_op_t* RESTRICT cop_list ); -void -apply_collision_op_list( collision_op_t * RESTRICT cop_list ); +void apply_collision_op_list( collision_op_t* RESTRICT cop_list ); -void -delete_collision_op_list( collision_op_t * RESTRICT cop_list ); +void delete_collision_op_list( collision_op_t* RESTRICT cop_list ); -collision_op_t * -append_collision_op( collision_op_t * cop, - collision_op_t ** cop_list ); +collision_op_t* append_collision_op( collision_op_t* cop, + collision_op_t** cop_list ); /* In takizuka_abe.c */ @@ -52,13 +48,11 @@ append_collision_op( collision_op_t * cop, pass us the base cvar */ -collision_op_t * -takizuka_abe( const char * RESTRICT name, - /**/ species_t * RESTRICT spi, - /**/ species_t * RESTRICT spj, - /**/ rng_pool_t * RESTRICT rp, - const double cvar0, - const int interval ); +collision_op_t* takizuka_abe( const char* RESTRICT name, + /**/ species_t* RESTRICT spi, + 
/**/ species_t* RESTRICT spj, + /**/ rng_pool_t* RESTRICT rp, const double cvar0, + const int interval ); /* In langevin.c */ @@ -80,12 +74,8 @@ takizuka_abe( const char * RESTRICT name, every interval time step to all the particle momenta in the species. Above, dW is a basic Weiner process. */ -collision_op_t * -langevin( float kT, - float nu, - species_t * RESTRICT sp, - rng_pool_t * RESTRICT rp, - int interval ); +collision_op_t* langevin( float kT, float nu, species_t* RESTRICT sp, + rng_pool_t* RESTRICT rp, int interval ); /* In unary.c */ @@ -116,10 +106,9 @@ langevin( float kT, return vi sigma(vi) n_background; } */ -typedef float -(*unary_rate_constant_func_t)( /**/ void * RESTRICT params, - const species_t * RESTRICT sp, - const particle_t * RESTRICT ALIGNED(32) p ); +typedef float ( *unary_rate_constant_func_t )( + /**/ void* RESTRICT params, const species_t* RESTRICT sp, + const particle_t* RESTRICT ALIGNED( 32 ) p ); /* A unary_collision_func_t implements the microscopic physics of a collision between a particle and some background whose properties @@ -137,24 +126,22 @@ typedef float pi->u{xyz} } */ -typedef void -(*unary_collision_func_t)( /**/ void * RESTRICT params, - const species_t * RESTRICT sp, - /**/ particle_t * RESTRICT ALIGNED(32) p, - /**/ rng_t * RESTRICT rng ); +typedef void ( *unary_collision_func_t )( + /**/ void* RESTRICT params, const species_t* RESTRICT sp, + /**/ particle_t* RESTRICT ALIGNED( 32 ) p, + /**/ rng_t* RESTRICT rng ); /* Declare a unary collision model with the given microscopic physics. params must be a registered object or NULL. Every particle is tested for collision on every "interval" timesteps. 
*/ -collision_op_t * -unary_collision_model( const char * RESTRICT name, - unary_rate_constant_func_t rate_constant, - unary_collision_func_t collision, - /**/ void * RESTRICT params, - /**/ species_t * RESTRICT sp, - /**/ rng_pool_t * RESTRICT rp, - int interval ); +collision_op_t* unary_collision_model( const char* RESTRICT name, + unary_rate_constant_func_t rate_constant, + unary_collision_func_t collision, + /**/ void* RESTRICT params, + /**/ species_t* RESTRICT sp, + /**/ rng_pool_t* RESTRICT rp, + int interval ); /* In binary.c */ @@ -219,12 +206,10 @@ unary_collision_model( const char * RESTRICT name, return vr sigma(vr); } */ -typedef float -(*binary_rate_constant_func_t)( /**/ void * RESTRICT params, - const species_t * RESTRICT spi, - const species_t * RESTRICT spj, - const particle_t * RESTRICT ALIGNED(32) pi, - const particle_t * RESTRICT ALIGNED(32) pj ); +typedef float ( *binary_rate_constant_func_t )( + /**/ void* RESTRICT params, const species_t* RESTRICT spi, + const species_t* RESTRICT spj, const particle_t* RESTRICT ALIGNED( 32 ) pi, + const particle_t* RESTRICT ALIGNED( 32 ) pj ); /* A binary_collision_func_t implements the microscopic physics of a collision between two particles, pi and pj. The basic control @@ -248,90 +233,83 @@ typedef float if( type & 2 ) pj->u{xyz} = uj{xyz}; } */ -typedef void -(*binary_collision_func_t)( /**/ void * RESTRICT params, - const species_t * RESTRICT spi, - const species_t * RESTRICT spj, - /**/ particle_t * RESTRICT ALIGNED(32) pi, - /**/ particle_t * RESTRICT ALIGNED(32) pj, - /**/ rng_t * RESTRICT rng, - int type ); +typedef void ( *binary_collision_func_t )( + /**/ void* RESTRICT params, const species_t* RESTRICT spi, + const species_t* RESTRICT spj, + /**/ particle_t* RESTRICT ALIGNED( 32 ) pi, + /**/ particle_t* RESTRICT ALIGNED( 32 ) pj, + /**/ rng_t* RESTRICT rng, int type ); /* Declare a binary collision model with the given microscopic physics. params must be a registered object or NULL. 
A particle in a species will be tested for collision on average at least "sample" times every "interval" timesteps. */ -collision_op_t * -binary_collision_model( const char * RESTRICT name, - binary_rate_constant_func_t rate_constant, - binary_collision_func_t collision, - /**/ void * RESTRICT params, - /**/ species_t * RESTRICT spi, - /**/ species_t * RESTRICT spj, - /**/ rng_pool_t * RESTRICT rp, - double sample, - int interval ); +collision_op_t* binary_collision_model( + const char* RESTRICT name, binary_rate_constant_func_t rate_constant, + binary_collision_func_t collision, + /**/ void* RESTRICT params, + /**/ species_t* RESTRICT spi, + /**/ species_t* RESTRICT spj, + /**/ rng_pool_t* RESTRICT rp, double sample, int interval ); /* In hard_sphere.c */ /* Based on unary_collision_model */ -collision_op_t * -hard_sphere_fluid( const char * RESTRICT name, /* Model name */ - const float n0, /* Fluid density (#/VOLUME) */ - const float v0x, /* Fluid x-drift (VELOCITY) */ - const float v0y, /* Fluid y-drift (VELOCITY) */ - const float v0z, /* Fluid z-drift (VELOCITY) */ - const float kT0, /* Fluid temperature (ENERGY) */ - const float m0, /* Fluid p. mass (MASS) */ - const float r0, /* Fluid p. radius (LENGTH) */ - species_t * RESTRICT sp, /* Species */ - const float rsp, /* Species p. radius (LENGTH) */ - rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ); /* How often to apply this */ +collision_op_t* +hard_sphere_fluid( const char* RESTRICT name, /* Model name */ + const float n0, /* Fluid density (#/VOLUME) */ + const float v0x, /* Fluid x-drift (VELOCITY) */ + const float v0y, /* Fluid y-drift (VELOCITY) */ + const float v0z, /* Fluid z-drift (VELOCITY) */ + const float kT0, /* Fluid temperature (ENERGY) */ + const float m0, /* Fluid p. mass (MASS) */ + const float r0, /* Fluid p. radius (LENGTH) */ + species_t* RESTRICT sp, /* Species */ + const float rsp, /* Species p. 
radius (LENGTH) */ + rng_pool_t* RESTRICT rp, /* Entropy pool */ + const int interval ); /* How often to apply this */ /* Based on binary_collision_model */ -collision_op_t * -hard_sphere( const char * RESTRICT name, /* Model name */ - species_t * RESTRICT spi, /* Species-i */ - const float ri, /* Species-i p. radius (LENGTH) */ - species_t * RESTRICT spj, /* Species-j */ - const float rj, /* Species-j p. radius (LENGTH) */ - rng_pool_t * RESTRICT rp, /* Entropy pool */ - const double sample, /* Sampling density */ - const int interval ); /* How often to apply this */ +collision_op_t* hard_sphere( const char* RESTRICT name, /* Model name */ + species_t* RESTRICT spi, /* Species-i */ + const float ri, /* Species-i p. radius (LENGTH) */ + species_t* RESTRICT spj, /* Species-j */ + const float rj, /* Species-j p. radius (LENGTH) */ + rng_pool_t* RESTRICT rp, /* Entropy pool */ + const double sample, /* Sampling density */ + const int interval ); /* How often to apply this */ /* In large_angle_coulomb.c */ /* Based on unary_collision_model */ -collision_op_t * -large_angle_coulomb_fluid( - const char * RESTRICT name, /* Model name */ - const float n0, /* Fluid density (#/VOLUME) */ - const float vdx, /* Fluid x-drift (VELOCITY) */ - const float vdy, /* Fluid y-drift (VELOCITY) */ - const float vdz, /* Fluid z-drift (VELOCITY) */ - const float kT0, /* Fluid temperature (ENERGY) */ - const float q0, /* Fluid particle charge (CHARGE) */ - const float m0, /* Fluid particle mass (MASS) */ - species_t * RESTRICT sp, /* Species */ - const float bmax, /* Impact parameter cutoff */ - rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ); /* How often to apply this */ +collision_op_t* +large_angle_coulomb_fluid( const char* RESTRICT name, /* Model name */ + const float n0, /* Fluid density (#/VOLUME) */ + const float vdx, /* Fluid x-drift (VELOCITY) */ + const float vdy, /* Fluid y-drift (VELOCITY) */ + const float vdz, /* Fluid z-drift (VELOCITY) */ + const float 
kT0, /* Fluid temperature (ENERGY) */ + const float q0, /* Fluid particle charge (CHARGE) */ + const float m0, /* Fluid particle mass (MASS) */ + species_t* RESTRICT sp, /* Species */ + const float bmax, /* Impact parameter cutoff */ + rng_pool_t* RESTRICT rp, /* Entropy pool */ + const int interval ); /* How often to apply this */ /* Based on binary_collision_model */ -collision_op_t * -large_angle_coulomb( const char * RESTRICT name, /* Model name */ - species_t * RESTRICT spi, /* Species-i */ - species_t * RESTRICT spj, /* Species-j */ - const float bmax, /* Impact parameter cutoff */ - rng_pool_t * RESTRICT rp, /* Entropy pool */ - const double sample, /* Sampling density */ - const int interval ); /* How often to apply this */ +collision_op_t* +large_angle_coulomb( const char* RESTRICT name, /* Model name */ + species_t* RESTRICT spi, /* Species-i */ + species_t* RESTRICT spj, /* Species-j */ + const float bmax, /* Impact parameter cutoff */ + rng_pool_t* RESTRICT rp, /* Entropy pool */ + const double sample, /* Sampling density */ + const int interval ); /* How often to apply this */ END_C_DECLS diff --git a/src/collision/collision_private.h b/src/collision/collision_private.h index 2bbef1b6..278bf739 100644 --- a/src/collision/collision_private.h +++ b/src/collision/collision_private.h @@ -7,37 +7,31 @@ #include "collision.h" -typedef void -(*collision_op_func_t)( void * params ); +typedef void ( *collision_op_func_t )( void* params ); -typedef void -(*delete_collision_op_func_t) ( struct collision_op * cop ); +typedef void ( *delete_collision_op_func_t )( struct collision_op* cop ); -struct collision_op { - void * params; - collision_op_func_t apply; - delete_collision_op_func_t delete_cop; - collision_op_t * next; +struct collision_op +{ + void* params; + collision_op_func_t apply; + delete_collision_op_func_t delete_cop; + collision_op_t* next; }; BEGIN_C_DECLS -void -checkpt_collision_op_internal( const collision_op_t * cop ); +void 
checkpt_collision_op_internal( const collision_op_t* cop ); -collision_op_t * -restore_collision_op_internal( void * params ); +collision_op_t* restore_collision_op_internal( void* params ); -collision_op_t * -new_collision_op_internal( void * params, - collision_op_func_t apply, +collision_op_t* +new_collision_op_internal( void* params, collision_op_func_t apply, delete_collision_op_func_t delete_cop, - checkpt_func_t checkpt, - restore_func_t restore, + checkpt_func_t checkpt, restore_func_t restore, reanimate_func_t reanimate ); -void -delete_collision_op_internal( collision_op_t * cop ); +void delete_collision_op_internal( collision_op_t* cop ); END_C_DECLS @@ -46,12 +40,13 @@ END_C_DECLS typedef struct langevin_pipeline_args { - MEM_PTR( particle_t, 128 ) p; - MEM_PTR( rng_t, 128 ) rng[ MAX_PIPELINE ]; - float decay; - float drive; - int np; - PAD_STRUCT( (1+MAX_PIPELINE)*SIZEOF_MEM_PTR+2*sizeof(float)+sizeof(int) ) + MEM_PTR( particle_t, 128 ) p; + MEM_PTR( rng_t, 128 ) rng[MAX_PIPELINE]; + float decay; + float drive; + int np; + PAD_STRUCT( ( 1 + MAX_PIPELINE ) * SIZEOF_MEM_PTR + 2 * sizeof( float ) + + sizeof( int ) ) } langevin_pipeline_args_t; // PROTOTYPE_PIPELINE( langevin, langevin_pipeline_args_t ); diff --git a/src/collision/langevin.h b/src/collision/langevin.h index c8dbbaf2..f82ba786 100644 --- a/src/collision/langevin.h +++ b/src/collision/langevin.h @@ -5,14 +5,13 @@ typedef struct langevin { - species_t * sp; - rng_pool_t * rp; - float kT; - float nu; - int interval; + species_t* sp; + rng_pool_t* rp; + float kT; + float nu; + int interval; } langevin_t; -void -apply_langevin_pipeline( langevin_t * l ); +void apply_langevin_pipeline( langevin_t* l ); #endif /* _langevin_h_ */ diff --git a/src/collision/pipeline/collision_pipeline.h b/src/collision/pipeline/collision_pipeline.h index b30f4d23..44777c90 100644 --- a/src/collision/pipeline/collision_pipeline.h +++ b/src/collision/pipeline/collision_pipeline.h @@ -3,27 +3,19 @@ #include 
"../binary.h" #include "../langevin.h" -#include "../unary.h" #include "../takizuka_abe.h" +#include "../unary.h" -void -binary_pipeline_scalar( binary_collision_model_t * RESTRICT cm, - int pipeline_rank, - int n_pipeline ); +void binary_pipeline_scalar( binary_collision_model_t* RESTRICT cm, + int pipeline_rank, int n_pipeline ); -void -langevin_pipeline_scalar( langevin_pipeline_args_t * RESTRICT args, - int pipeline_rank, - int n_pipeline ); +void langevin_pipeline_scalar( langevin_pipeline_args_t* RESTRICT args, + int pipeline_rank, int n_pipeline ); -void -unary_pipeline_scalar( unary_collision_model_t * RESTRICT cm, - int pipeline_rank, - int n_pipeline ); +void unary_pipeline_scalar( unary_collision_model_t* RESTRICT cm, + int pipeline_rank, int n_pipeline ); -void -takizuka_abe_pipeline_scalar( takizuka_abe_t * RESTRICT cm, - int pipeline_rank, - int n_pipeline ); +void takizuka_abe_pipeline_scalar( takizuka_abe_t* RESTRICT cm, + int pipeline_rank, int n_pipeline ); #endif /* _collision_pipeline_h_ */ diff --git a/src/collision/takizuka_abe.h b/src/collision/takizuka_abe.h index 57d04553..be0e78d7 100644 --- a/src/collision/takizuka_abe.h +++ b/src/collision/takizuka_abe.h @@ -3,16 +3,16 @@ #include "collision_private.h" -typedef struct takizuka_abe { - char * name; - species_t * spi; - species_t * spj; - rng_pool_t * rp; - int interval; - double cvar0; // Base cvar0, which will later be scaled by q and mu +typedef struct takizuka_abe +{ + char* name; + species_t* spi; + species_t* spj; + rng_pool_t* rp; + int interval; + double cvar0; // Base cvar0, which will later be scaled by q and mu } takizuka_abe_t; -void -apply_takizuka_abe_pipeline( takizuka_abe_t * l ); +void apply_takizuka_abe_pipeline( takizuka_abe_t* l ); #endif /* _takizuka_abe_h_ */ diff --git a/src/collision/unary.h b/src/collision/unary.h index c14b92b9..de04e427 100644 --- a/src/collision/unary.h +++ b/src/collision/unary.h @@ -5,17 +5,16 @@ typedef struct unary_collision_model { - char * 
name; - unary_rate_constant_func_t rate_constant; - unary_collision_func_t collision; - void * params; - species_t * sp; - rng_pool_t * rp; - int interval; - int n_large_pr[ MAX_PIPELINE ]; + char* name; + unary_rate_constant_func_t rate_constant; + unary_collision_func_t collision; + void* params; + species_t* sp; + rng_pool_t* rp; + int interval; + int n_large_pr[MAX_PIPELINE]; } unary_collision_model_t; -void -apply_unary_collision_model_pipeline( unary_collision_model_t * cm ); +void apply_unary_collision_model_pipeline( unary_collision_model_t* cm ); #endif /* _unary_h_ */ diff --git a/src/emitter/emitter.h b/src/emitter/emitter.h index 4efc8c4c..ccb14ee9 100644 --- a/src/emitter/emitter.h +++ b/src/emitter/emitter.h @@ -23,57 +23,50 @@ typedef struct emitter emitter_t; // a (-1:1,-1:1,-1:1) FORTRAN style indexing calculation. Note that // this allows distinctions like which side of a cell a face is on. -#define COMPONENT_ID( local_cell, component_type ) \ - (((local_cell)<<5) | (component_type)) -#define EXTRACT_LOCAL_CELL( component_id ) ((component_id)>>5) -#define EXTRACT_COMPONENT_TYPE( component_id ) ((component_id)&31) +#define COMPONENT_ID( local_cell, component_type ) \ + ( ( ( local_cell ) << 5 ) | ( component_type ) ) +#define EXTRACT_LOCAL_CELL( component_id ) ( ( component_id ) >> 5 ) +#define EXTRACT_COMPONENT_TYPE( component_id ) ( (component_id)&31 ) BEGIN_C_DECLS // In emitter.c -int -num_emitter( const emitter_t * e_list ); +int num_emitter( const emitter_t* e_list ); -void -apply_emitter_list( emitter_t * e_list ); +void apply_emitter_list( emitter_t* e_list ); -void -delete_emitter_list( emitter_t * e_list ); +void delete_emitter_list( emitter_t* e_list ); // Note that this append is hacked to silently return if the given // emitter is already part of the list. This allows the emitter // initialization in vpic.h / deck_wrappers.cxx to get around -// some limitations of strict C++. +// some limitations of strict C++. 
-emitter_t * -append_emitter( emitter_t * e, - emitter_t ** e_list ); +emitter_t* append_emitter( emitter_t* e, emitter_t** e_list ); // Each emitter must be sized once and only once. Returns // the buffer were the emitter components should be stored. -int32_t * ALIGNED(128) -size_emitter( emitter_t * e, - int n_component ); +int32_t* ALIGNED( 128 ) size_emitter( emitter_t* e, int n_component ); // In child-langmuir.c -#define CHILD_LANGMUIR sqrt(32./81.) -#define CCUBE sqrt(1./6.) -#define IVORY sqrt(1./6.) - -emitter_t * -child_langmuir( /**/ species_t * RESTRICT sp, // Species to emit - const interpolator_array_t * RESTRICT ia, // For field interpolation - /**/ field_array_t * RESTRICT fa, // For rhob accum (inject) - /**/ accumulator_array_t * RESTRICT aa, // For Jf accum (aging) - /**/ rng_pool_t * RESTRICT rp, // Random number pool - int n_emit_per_face, // Particles to emit per face per step - float ut_perp, // Perpendicular normalized thermal momentum - float ut_para, // Parallel normalized thermal momentum - float thresh_e_norm, // Only emit if E_norm>thresh_e_norm - float norm ); // Child-langmuir normalization +#define CHILD_LANGMUIR sqrt( 32. / 81. ) +#define CCUBE sqrt( 1. / 6. ) +#define IVORY sqrt( 1. / 6. 
) + +emitter_t* child_langmuir( + /**/ species_t* RESTRICT sp, // Species to emit + const interpolator_array_t* RESTRICT ia, // For field interpolation + /**/ field_array_t* RESTRICT fa, // For rhob accum (inject) + /**/ accumulator_array_t* RESTRICT aa, // For Jf accum (aging) + /**/ rng_pool_t* RESTRICT rp, // Random number pool + int n_emit_per_face, // Particles to emit per face per step + float ut_perp, // Perpendicular normalized thermal momentum + float ut_para, // Parallel normalized thermal momentum + float thresh_e_norm, // Only emit if E_norm>thresh_e_norm + float norm ); // Child-langmuir normalization END_C_DECLS diff --git a/src/emitter/emitter_private.h b/src/emitter/emitter_private.h index e54ccea4..dd1159fe 100644 --- a/src/emitter/emitter_private.h +++ b/src/emitter/emitter_private.h @@ -7,43 +7,36 @@ #include "emitter.h" -typedef void -(*emit_func_t)( /**/ void * RESTRICT params, - const int * RESTRICT ALIGNED(128) component, - int n_component ); - -typedef void -(*delete_emitter_func_t)( emitter_t * RESTRICT e ); - -struct emitter { - void * params; - emit_func_t emit; - delete_emitter_func_t delete_e; - int * ALIGNED(128) component; - int n_component; - emitter_t * next; +typedef void ( *emit_func_t )( /**/ void* RESTRICT params, + const int* RESTRICT ALIGNED( 128 ) component, + int n_component ); + +typedef void ( *delete_emitter_func_t )( emitter_t* RESTRICT e ); + +struct emitter +{ + void* params; + emit_func_t emit; + delete_emitter_func_t delete_e; + int* ALIGNED( 128 ) component; + int n_component; + emitter_t* next; }; BEGIN_C_DECLS // In emitter.c -void -checkpt_emitter_internal( const emitter_t * e ); +void checkpt_emitter_internal( const emitter_t* e ); -emitter_t * -restore_emitter_internal( void * params ); +emitter_t* restore_emitter_internal( void* params ); -emitter_t * -new_emitter_internal( void * params, - emit_func_t emit, - delete_emitter_func_t delete_e, - checkpt_func_t checkpt, - restore_func_t restore, - 
reanimate_func_t reanimate ); +emitter_t* new_emitter_internal( void* params, emit_func_t emit, + delete_emitter_func_t delete_e, + checkpt_func_t checkpt, restore_func_t restore, + reanimate_func_t reanimate ); -void -delete_emitter_internal( emitter_t * e ); +void delete_emitter_internal( emitter_t* e ); END_C_DECLS diff --git a/src/field_advance/field_advance.h b/src/field_advance/field_advance.h index d1cee710..3b5bf40b 100644 --- a/src/field_advance/field_advance.h +++ b/src/field_advance/field_advance.h @@ -13,7 +13,7 @@ // // This module implements the following the difference equations on a // superhexahedral domain decomposed Yee-mesh: -// +// // advance_b -> Finite Differenced Faraday // cB_new = cB_old - frac c dt curl E // @@ -32,7 +32,7 @@ // rapidly reduce RMS divergence error assuming divergences errors // are due to accumulation of numerical roundoff when integrating // Faraday. See clean_div.c for details. -// +// // div_clean_e -> Modified Marder pass on electric fields // E_new = E_old + drive D dt grad err_mul div ( epsr E_old - rho/eps0 ) // Since the total rho may not be known everywhere (for example in @@ -65,7 +65,7 @@ // fmatx,fmaty,fmatz are all on the "face // mesh". rhof,rhob,div_e_err,nmat are on the "nodes mesh". // div_b_err,cmat are on the "cell mesh". -// +// // Above, for "edge mesh" quantities, interior means that the // component is not a tangential field directly on the surface of the // domain. For "face mesh" quantities, interior means that the @@ -97,7 +97,7 @@ // ... // material_coefficients = new_material_coefficients(grid,material_list); // fields = new_fields(grid); -// +// // ... Set the initial field values and place materials ... // // synchronize_fields(fields,grid); @@ -107,7 +107,7 @@ // initial fields or errors in the source terms or different floating // point properties on different nodes cause the shared faces to have // different fields). 
-// +// // To advance the fields in a PIC simulation with TCA radation damping // and periodic divergence cleaning, the following sequence is // suggested: @@ -118,7 +118,7 @@ // if( should_clean_div_e ) { // ... adjust rho_f, rho_b and/or rho_c as necessary // do { -// rms_err = clean_div_e( fields, material_coefficients, grid ); +// rms_err = clean_div_e( fields, material_coefficients, grid ); // } while( rms_err_too_high ); // } // if( should_clean_div_b ) { @@ -151,12 +151,12 @@ typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers } field_t; // field_advance_kernels holds all the function pointers to all the @@ -169,51 +169,51 @@ struct field_array; typedef struct field_advance_kernels { - // FIXME: DUMP.CXX SHOULD BE DECENTRALIZED AND DIAGNOSTIC DUMP - // FOR FIELDS SHOULD BE ADDED TO THIS - // FIXME: FOR SYSTEMS WITH MAGNETIC CURRENTS (E.G. PML LAYERS) - // WOULD INTERFACES FOR xif,kf BE USEFUL? + // FIXME: DUMP.CXX SHOULD BE DECENTRALIZED AND DIAGNOSTIC DUMP + // FOR FIELDS SHOULD BE ADDED TO THIS + // FIXME: FOR SYSTEMS WITH MAGNETIC CURRENTS (E.G. PML LAYERS) + // WOULD INTERFACES FOR xif,kf BE USEFUL? 
- void (*delete_fa)( struct field_array * RESTRICT fa ); + void ( *delete_fa )( struct field_array* RESTRICT fa ); - // Time stepping interface + // Time stepping interface - void (*advance_b)( struct field_array * RESTRICT fa, float frac ); - void (*advance_e)( struct field_array * RESTRICT fa, float frac ); + void ( *advance_b )( struct field_array* RESTRICT fa, float frac ); + void ( *advance_e )( struct field_array* RESTRICT fa, float frac ); - // Diagnostic interface - // FIXME: MAY NEED MORE CAREFUL THOUGHT FOR CURVILINEAR SYSTEMS + // Diagnostic interface + // FIXME: MAY NEED MORE CAREFUL THOUGHT FOR CURVILINEAR SYSTEMS - void (*energy_f)( /**/ double * RESTRICT en, // 6 elem - const struct field_array * RESTRICT fa ); + void ( *energy_f )( /**/ double* RESTRICT en, // 6 elem + const struct field_array* RESTRICT fa ); - // Accumulator interface + // Accumulator interface - void (*clear_jf )( struct field_array * RESTRICT fa ); - void (*synchronize_jf )( struct field_array * RESTRICT fa ); - void (*clear_rhof )( struct field_array * RESTRICT fa ); - void (*synchronize_rho)( struct field_array * RESTRICT fa ); + void ( *clear_jf )( struct field_array* RESTRICT fa ); + void ( *synchronize_jf )( struct field_array* RESTRICT fa ); + void ( *clear_rhof )( struct field_array* RESTRICT fa ); + void ( *synchronize_rho )( struct field_array* RESTRICT fa ); - // Initialization interface + // Initialization interface - void (*compute_rhob )( struct field_array * RESTRICT fa ); - void (*compute_curl_b)( struct field_array * RESTRICT fa ); + void ( *compute_rhob )( struct field_array* RESTRICT fa ); + void ( *compute_curl_b )( struct field_array* RESTRICT fa ); - // Local/remote shared face cleaning + // Local/remote shared face cleaning - double (*synchronize_tang_e_norm_b)( struct field_array * RESTRICT fa ); + double ( *synchronize_tang_e_norm_b )( struct field_array* RESTRICT fa ); - // Electric field divergence cleaning interface + // Electric field divergence 
cleaning interface - void (*compute_div_e_err )( /**/ struct field_array * RESTRICT fa ); - double (*compute_rms_div_e_err)( const struct field_array * RESTRICT fa ); - void (*clean_div_e )( /**/ struct field_array * RESTRICT fa ); + void ( *compute_div_e_err )( /**/ struct field_array* RESTRICT fa ); + double ( *compute_rms_div_e_err )( const struct field_array* RESTRICT fa ); + void ( *clean_div_e )( /**/ struct field_array* RESTRICT fa ); - // Magnetic field divergence cleaning interface + // Magnetic field divergence cleaning interface - void (*compute_div_b_err )( /**/ struct field_array * RESTRICT fa ); - double (*compute_rms_div_b_err)( const struct field_array * RESTRICT fa ); - void (*clean_div_b )( /**/ struct field_array * RESTRICT fa ); + void ( *compute_div_b_err )( /**/ struct field_array* RESTRICT fa ); + double ( *compute_rms_div_b_err )( const struct field_array* RESTRICT fa ); + void ( *clean_div_b )( /**/ struct field_array* RESTRICT fa ); } field_advance_kernels_t; @@ -222,21 +222,19 @@ typedef struct field_advance_kernels typedef struct field_array { - field_t * ALIGNED(128) f; // Local field data - grid_t * g; // Underlying grid - void * params; // Field advance specific parameters - field_advance_kernels_t kernel[1]; // Field advance kernels + field_t* ALIGNED( 128 ) f; // Local field data + grid_t* g; // Underlying grid + void* params; // Field advance specific parameters + field_advance_kernels_t kernel[1]; // Field advance kernels } field_array_t; BEGIN_C_DECLS -field_array_t * -new_standard_field_array( grid_t * RESTRICT g, - const material_t * RESTRICT m_list, - float damp ); +field_array_t* new_standard_field_array( grid_t* RESTRICT g, + const material_t* RESTRICT m_list, + float damp ); -void -delete_field_array( field_array_t * fa ); +void delete_field_array( field_array_t* fa ); END_C_DECLS diff --git a/src/field_advance/field_advance_private.h b/src/field_advance/field_advance_private.h index 740bf9f0..f1d59fb0 100644 --- 
a/src/field_advance/field_advance_private.h +++ b/src/field_advance/field_advance_private.h @@ -11,13 +11,11 @@ BEGIN_C_DECLS /* Checkpoint the symbols for the given field advance kernel */ -void -checkpt_field_advance_kernels( const field_advance_kernels_t * kernel ); +void checkpt_field_advance_kernels( const field_advance_kernels_t* kernel ); /* Checkpoint the symbols for the given field advance kernel */ -void -restore_field_advance_kernels( field_advance_kernels_t * kernel ); +void restore_field_advance_kernels( field_advance_kernels_t* kernel ); END_C_DECLS diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline.h b/src/field_advance/standard/pipeline/advance_b_pipeline.h index 132d8419..2df10676 100644 --- a/src/field_advance/standard/pipeline/advance_b_pipeline.h +++ b/src/field_advance/standard/pipeline/advance_b_pipeline.h @@ -9,73 +9,76 @@ typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; - float frac; + field_t* ALIGNED( 128 ) f; + const grid_t* g; + float frac; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const grid_t * g = args->g; \ - \ - const int nx = g->nx; \ - const int ny = g->ny; \ - const int nz = g->nz; \ - \ - const float frac = args->frac; \ - const float px = (nx>1) ? frac*g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? frac*g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? frac*g->cvac*g->dt*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const grid_t* g = args->g; \ + \ + const int nx = g->nx; \ + const int ny = g->ny; \ + const int nz = g->nz; \ + \ + const float frac = args->frac; \ + const float px = ( nx > 1 ) ? frac * g->cvac * g->dt * g->rdx : 0; \ + const float py = ( ny > 1 ) ? frac * g->cvac * g->dt * g->rdy : 0; \ + const float pz = ( nz > 1 ) ? 
frac * g->cvac * g->dt * g->rdz : 0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x+1, y, z ); \ - fy = &f( x, y+1, z ); \ - fz = &f( x, y, z+1 ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x + 1, y, z ); \ + fy = &f( x, y + 1, z ); \ + fz = &f( x, y, z + 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 1; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 1; \ + INIT_STENCIL(); \ + } -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 1; \ - if ( y > ny ) z++; if ( y > ny ) y = 1; \ - INIT_STENCIL(); \ - } - // WTF! Under -ffast-math, gcc-4.1.1 thinks it is okay to treat the // below as // f0->cbx = ( f0->cbx + py*( blah ) ) - pz*( blah ) // even with explicit parenthesis are in there! Oh my ... 
// -fno-unsafe-math-optimizations must be used -#define UPDATE_CBX() f0->cbx -= ( py*( fy->ez-f0->ez ) - pz*( fz->ey-f0->ey ) ) -#define UPDATE_CBY() f0->cby -= ( pz*( fz->ex-f0->ex ) - px*( fx->ez-f0->ez ) ) -#define UPDATE_CBZ() f0->cbz -= ( px*( fx->ey-f0->ey ) - py*( fy->ex-f0->ex ) ) +#define UPDATE_CBX() \ + f0->cbx -= ( py * ( fy->ez - f0->ez ) - pz * ( fz->ey - f0->ey ) ) +#define UPDATE_CBY() \ + f0->cby -= ( pz * ( fz->ex - f0->ex ) - px * ( fx->ez - f0->ez ) ) +#define UPDATE_CBZ() \ + f0->cbz -= ( px * ( fx->ey - f0->ey ) - py * ( fy->ex - f0->ex ) ) -void -advance_b_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_b_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_b_pipeline_v4( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_b_pipeline_v8( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_b_pipeline_v8( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_b_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_b_pipeline_v16( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _advance_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline.h b/src/field_advance/standard/pipeline/advance_e_pipeline.h index 32b84ff1..228e4fba 100644 --- a/src/field_advance/standard/pipeline/advance_e_pipeline.h +++ b/src/field_advance/standard/pipeline/advance_e_pipeline.h @@ -9,89 +9,89 @@ typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const 
grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float damp = args->p->damp; \ - const float px = (nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0; \ - const float cj = g->dt/g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float damp = args->p->damp; \ + const float px = ( nx > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdx : 0; \ + const float py = ( ny > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdy : 0; \ + const float pz = ( nz > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdz : 0; \ + const float cj = g->dt / g->eps0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } -#define UPDATE_EX() \ - f0->tcax = ( py * ( f0->cbz * m[f0->fmatz].rmuz - \ - fy->cbz * m[fy->fmatz].rmuz ) - \ - pz * ( f0->cby * m[f0->fmaty].rmuy - \ - 
fz->cby * m[fz->fmaty].rmuy ) ) - \ - damp * f0->tcax; \ - f0->ex = m[f0->ematx].decayx * f0->ex + \ +#define UPDATE_EX() \ + f0->tcax = \ + ( py * ( f0->cbz * m[f0->fmatz].rmuz - fy->cbz * m[fy->fmatz].rmuz ) - \ + pz * ( f0->cby * m[f0->fmaty].rmuy - \ + fz->cby * m[fz->fmaty].rmuy ) ) - \ + damp * f0->tcax; \ + f0->ex = m[f0->ematx].decayx * f0->ex + \ m[f0->ematx].drivex * ( f0->tcax - cj * f0->jfx ) -#define UPDATE_EY() \ - f0->tcay = ( pz * ( f0->cbx * m[f0->fmatx].rmux - \ - fz->cbx * m[fz->fmatx].rmux ) - \ - px * ( f0->cbz * m[f0->fmatz].rmuz - \ - fx->cbz * m[fx->fmatz].rmuz ) ) - \ - damp * f0->tcay; \ - f0->ey = m[f0->ematy].decayy * f0->ey + \ +#define UPDATE_EY() \ + f0->tcay = \ + ( pz * ( f0->cbx * m[f0->fmatx].rmux - fz->cbx * m[fz->fmatx].rmux ) - \ + px * ( f0->cbz * m[f0->fmatz].rmuz - \ + fx->cbz * m[fx->fmatz].rmuz ) ) - \ + damp * f0->tcay; \ + f0->ey = m[f0->ematy].decayy * f0->ey + \ m[f0->ematy].drivey * ( f0->tcay - cj * f0->jfy ) -#define UPDATE_EZ() \ - f0->tcaz = ( px * ( f0->cby * m[f0->fmaty].rmuy - \ - fx->cby * m[fx->fmaty].rmuy ) - \ - py * ( f0->cbx * m[f0->fmatx].rmux - \ - fy->cbx * m[fy->fmatx].rmux ) ) - \ - damp * f0->tcaz; \ - f0->ez = m[f0->ematz].decayz * f0->ez + \ +#define UPDATE_EZ() \ + f0->tcaz = \ + ( px * ( f0->cby * m[f0->fmaty].rmuy - fx->cby * m[fx->fmaty].rmuy ) - \ + py * ( f0->cbx * m[f0->fmatx].rmux - \ + fy->cbx * m[fy->fmatx].rmux ) ) - \ + damp * f0->tcaz; \ + f0->ez = m[f0->ematz].decayz * f0->ez + \ m[f0->ematz].drivez * ( f0->tcaz - cj * f0->jfz ) -void -advance_e_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_e_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_e_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_e_pipeline_v4( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_e_pipeline_v8( pipeline_args_t * args, - int pipeline_rank, - int 
n_pipeline ); +void advance_e_pipeline_v8( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_e_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_e_pipeline_v16( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _advance_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline.h b/src/field_advance/standard/pipeline/clean_div_b_pipeline.h index 4b68c433..277ef0d6 100644 --- a/src/field_advance/standard/pipeline/clean_div_b_pipeline.h +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline.h @@ -2,41 +2,34 @@ #define _clean_div_b_pipeline_h_ #ifndef IN_clean_div_b_pipeline -#error "Only include clean_div_b_pipeline.h in clean_div_b_pipeline source files." +#error \ + "Only include clean_div_b_pipeline.h in clean_div_b_pipeline source files." #endif #include "../../field_advance.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const grid_t* g; } pipeline_args_t; -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define MARDER_CBX() f0->cbx += px*( f0->div_b_err - fx->div_b_err ) -#define MARDER_CBY() f0->cby += py*( f0->div_b_err - fy->div_b_err ) -#define MARDER_CBZ() f0->cbz += pz*( f0->div_b_err - fz->div_b_err ) +#define MARDER_CBX() f0->cbx += px * ( f0->div_b_err - fx->div_b_err ) +#define MARDER_CBY() f0->cby += py * ( f0->div_b_err - fy->div_b_err ) +#define MARDER_CBZ() f0->cbz += pz * ( f0->div_b_err - fz->div_b_err ) -void -clean_div_b_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void clean_div_b_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -clean_div_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void clean_div_b_pipeline_v4( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void 
-clean_div_b_pipeline_v8( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void clean_div_b_pipeline_v8( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -clean_div_b_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void clean_div_b_pipeline_v16( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _clean_div_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/clean_div_e_pipeline.h b/src/field_advance/standard/pipeline/clean_div_e_pipeline.h index bfd7b154..2eaae616 100644 --- a/src/field_advance/standard/pipeline/clean_div_e_pipeline.h +++ b/src/field_advance/standard/pipeline/clean_div_e_pipeline.h @@ -2,62 +2,71 @@ #define _clean_div_e_pipeline_h_ #ifndef IN_clean_div_e_pipeline -#error "Only include clean_div_e_pipeline.h in clean_div_e_pipeline source files." +#error \ + "Only include clean_div_e_pipeline.h in clean_div_e_pipeline source files." #endif #include "../sfa_private.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float _rdx = (nx>1) ? g->rdx : 0; \ - const float _rdy = (ny>1) ? g->rdy : 0; \ - const float _rdz = (nz>1) ? 
g->rdz : 0; \ - const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ - const float px = alphadt*_rdx; \ - const float py = alphadt*_rdy; \ - const float pz = alphadt*_rdz; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z,nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -#define MARDER_EX() \ - f0->ex += m[f0->ematx].drivex*px*(fx->div_e_err-f0->div_e_err) -#define MARDER_EY() \ - f0->ey += m[f0->ematy].drivey*py*(fy->div_e_err-f0->div_e_err) -#define MARDER_EZ() \ - f0->ez += m[f0->ematz].drivez*pz*(fz->div_e_err-f0->div_e_err) - -static void -clean_div_e_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float _rdx = ( nx > 1 ) ? g->rdx : 0; \ + const float _rdy = ( ny > 1 ) ? g->rdy : 0; \ + const float _rdz = ( nz > 1 ) ? 
g->rdz : 0; \ + const float alphadt = \ + 0.3888889 / ( _rdx * _rdx + _rdy * _rdy + _rdz * _rdz ); \ + const float px = alphadt * _rdx; \ + const float py = alphadt * _rdy; \ + const float pz = alphadt * _rdz; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x + 1, y, z ); \ + fy = &f( x, y + 1, z ); \ + fz = &f( x, y, z + 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 1; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 1; \ + INIT_STENCIL(); \ + } + +#define MARDER_EX() \ + f0->ex += m[f0->ematx].drivex * px * ( fx->div_e_err - f0->div_e_err ) +#define MARDER_EY() \ + f0->ey += m[f0->ematy].drivey * py * ( fy->div_e_err - f0->div_e_err ) +#define MARDER_EZ() \ + f0->ez += m[f0->ematz].drivez * pz * ( fz->div_e_err - f0->div_e_err ) + +static void clean_div_e_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); #endif // _clean_div_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h index 80e03c6a..baefcbc0 100644 --- a/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h @@ -2,85 +2,83 @@ #define _compute_curl_b_pipeline_h_ #ifndef IN_compute_curl_b_pipeline -#error "Only include compute_curl_b_pipeline.h in compute_curl_b_pipeline source files." +#error \ + "Only include compute_curl_b_pipeline.h in compute_curl_b_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? g->cvac*g->dt*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = ( nx > 1 ) ? g->cvac * g->dt * g->rdx : 0; \ + const float py = ( ny > 1 ) ? g->cvac * g->dt * g->rdy : 0; \ + const float pz = ( nz > 1 ) ? 
g->cvac * g->dt * g->rdz : 0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } -#define UPDATE_EX() \ - f0->tcax = ( py * ( f0->cbz * m[f0->fmatz].rmuz - \ - fy->cbz * m[fy->fmatz].rmuz ) - \ - pz * ( f0->cby * m[f0->fmaty].rmuy - \ - fz->cby * m[fz->fmaty].rmuy ) ) +#define UPDATE_EX() \ + f0->tcax = \ + ( py * ( f0->cbz * m[f0->fmatz].rmuz - fy->cbz * m[fy->fmatz].rmuz ) - \ + pz * ( f0->cby * m[f0->fmaty].rmuy - fz->cby * m[fz->fmaty].rmuy ) ) -#define UPDATE_EY() \ - f0->tcay = ( pz * ( f0->cbx * m[f0->fmatx].rmux - \ - fz->cbx * m[fz->fmatx].rmux ) - \ - px * ( f0->cbz * m[f0->fmatz].rmuz - \ - fx->cbz * m[fx->fmatz].rmuz ) ) +#define UPDATE_EY() \ + f0->tcay = \ + ( pz * ( f0->cbx * m[f0->fmatx].rmux - fz->cbx * m[fz->fmatx].rmux ) - \ + px * ( f0->cbz * m[f0->fmatz].rmuz - fx->cbz * m[fx->fmatz].rmuz ) ) -#define UPDATE_EZ() \ - f0->tcaz = ( px * ( f0->cby * m[f0->fmaty].rmuy - \ - fx->cby * m[fx->fmaty].rmuy ) - \ - py * ( f0->cbx * m[f0->fmatx].rmux - \ - fy->cbx * m[fy->fmatx].rmux ) ) +#define UPDATE_EZ() \ + f0->tcaz = \ + ( px * ( f0->cby * m[f0->fmaty].rmuy - fx->cby * m[fx->fmaty].rmuy ) - \ + py * ( f0->cbx * m[f0->fmatx].rmux - fy->cbx * m[fy->fmatx].rmux ) ) -void 
-compute_curl_b_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void compute_curl_b_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -compute_curl_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void compute_curl_b_pipeline_v4( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -compute_curl_b_pipeline_v8( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void compute_curl_b_pipeline_v8( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -compute_curl_b_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void compute_curl_b_pipeline_v16( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _compute_curl_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h index 246d4731..071ef83b 100644 --- a/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h @@ -2,22 +2,21 @@ #define _compute_div_b_err_pipeline_h_ #ifndef IN_compute_div_b_err_pipeline -#error "Only include compute_div_b_err_pipeline.h in compute_div_b_err_pipeline source files." +#error \ + "Only include compute_div_b_err_pipeline.h in compute_div_b_err_pipeline source files." 
#endif #include "../../field_advance.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const grid_t* g; } pipeline_args_t; -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -void -compute_div_b_err_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void compute_div_b_err_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); #endif // _compute_div_b_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h index 9a0c1472..431d9ac3 100644 --- a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h @@ -2,58 +2,68 @@ #define _compute_div_e_err_pipeline_h_ #ifndef IN_compute_div_e_err_pipeline -#error "Only include compute_div_e_err_pipeline.h in compute_div_e_err_pipeline source files." +#error \ + "Only include compute_div_e_err_pipeline.h in compute_div_e_err_pipeline source files." #endif #include "../sfa_private.h" typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + /**/ field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->rdx : 0; \ - const float py = (ny>1) ? g->rdy : 0; \ - const float pz = (nz>1) ? 
g->rdz : 0; \ - const float cj = 1./g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->div_e_err = m[f0->nmat].nonconductive * \ - ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ - py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ - pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ - cj*( f0->rhof + f0->rhob ) ) - -void -compute_div_e_err_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +#define DECLARE_STENCIL() \ + /**/ field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = ( nx > 1 ) ? g->rdx : 0; \ + const float py = ( ny > 1 ) ? g->rdy : 0; \ + const float pz = ( nz > 1 ) ? g->rdz : 0; \ + const float cj = 1. 
/ g->eps0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() \ + f0->div_e_err = \ + m[f0->nmat].nonconductive * \ + ( px * ( m[f0->ematx].epsx * f0->ex - m[fx->ematx].epsx * fx->ex ) + \ + py * ( m[f0->ematy].epsy * f0->ey - m[fy->ematy].epsy * fy->ey ) + \ + pz * ( m[f0->ematz].epsz * f0->ez - m[fz->ematz].epsz * fz->ez ) - \ + cj * ( f0->rhof + f0->rhob ) ) + +void compute_div_e_err_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); #endif // _compute_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rhob_pipeline.h b/src/field_advance/standard/pipeline/compute_rhob_pipeline.h index 33f94308..c6217b33 100644 --- a/src/field_advance/standard/pipeline/compute_rhob_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_rhob_pipeline.h @@ -2,57 +2,67 @@ #define _compute_rhob_pipeline_h_ #ifndef IN_compute_rhob_pipeline -#error "Only include compute_rhob_pipeline.h in compute_rhob_pipeline source files." +#error \ + "Only include compute_rhob_pipeline.h in compute_rhob_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + /**/ field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->eps0*g->rdx : 0; \ - const float py = (ny>1) ? g->eps0*g->rdy : 0; \ - const float pz = (nz>1) ? g->eps0*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->rhob = m[f0->nmat].nonconductive * \ - ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ - py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ - pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ - f0->rhof ) - -void -compute_rhob_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +#define DECLARE_STENCIL() \ + /**/ field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = ( nx > 1 ) ? g->eps0 * g->rdx : 0; \ + const float py = ( ny > 1 ) ? g->eps0 * g->rdy : 0; \ + const float pz = ( nz > 1 ) ? 
g->eps0 * g->rdz : 0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() \ + f0->rhob = \ + m[f0->nmat].nonconductive * \ + ( px * ( m[f0->ematx].epsx * f0->ex - m[fx->ematx].epsx * fx->ex ) + \ + py * ( m[f0->ematy].epsy * f0->ey - m[fy->ematy].epsy * fy->ey ) + \ + pz * ( m[f0->ematz].epsz * f0->ez - m[fz->ematz].epsz * fz->ez ) - \ + f0->rhof ) + +void compute_rhob_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _compute_rhob_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h index c95803df..df719ed0 100644 --- a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h @@ -2,23 +2,23 @@ #define _compute_rms_div_b_err_pipeline_h_ #ifndef IN_compute_rms_div_b_err_pipeline -#error "Only include compute_rms_div_b_err_pipeline.h in compute_rms_div_b_err_pipeline source files." +#error \ + "Only include compute_rms_div_b_err_pipeline.h in compute_rms_div_b_err_pipeline source files." 
#endif #include "../../field_advance.h" typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const grid_t * g; - double err[MAX_PIPELINE+1]; + const field_t* ALIGNED( 128 ) f; + const grid_t* g; + double err[MAX_PIPELINE + 1]; } pipeline_args_t; -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -static void -compute_rms_div_b_err_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +static void compute_rms_div_b_err_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, + int n_pipeline ); #endif // _compute_rms_div_b_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h index 02c801ab..9c6f352b 100644 --- a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h +++ b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h @@ -2,23 +2,23 @@ #define _compute_rms_div_e_err_pipeline_h_ #ifndef IN_compute_rms_div_e_err_pipeline -#error "Only include compute_rms_div_e_err_pipeline.h in compute_rms_div_e_err_pipeline source files." +#error \ + "Only include compute_rms_div_e_err_pipeline.h in compute_rms_div_e_err_pipeline source files." 
#endif #include "../../field_advance.h" typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const grid_t * g; - double err[MAX_PIPELINE+1]; + const field_t* ALIGNED( 128 ) f; + const grid_t* g; + double err[MAX_PIPELINE + 1]; } pipeline_args_t; -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -static void -compute_rms_div_e_err_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +static void compute_rms_div_e_err_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, + int n_pipeline ); #endif // _compute_rms_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/energy_f_pipeline.h b/src/field_advance/standard/pipeline/energy_f_pipeline.h index fe9e69f7..062edbb0 100644 --- a/src/field_advance/standard/pipeline/energy_f_pipeline.h +++ b/src/field_advance/standard/pipeline/energy_f_pipeline.h @@ -9,66 +9,76 @@ typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; - double en[ MAX_PIPELINE + 1 ][ 6 ]; + const field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; + double en[MAX_PIPELINE + 1][6]; } pipeline_args_t; -#define DECLARE_STENCIL() \ - const field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const field_t * ALIGNED(16) f0; \ - const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ - double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ - int x, y, z +#define DECLARE_STENCIL() \ + const field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const field_t* ALIGNED( 16 ) f0; \ + const 
field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + const field_t *ALIGNED( 16 ) fyz, *ALIGNED( 16 ) fzx, *ALIGNED( 16 ) fxy; \ + double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ + int x, y, z -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1); \ - fyz = &f(x, y+1,z+1); \ - fzx = &f(x+1,y, z+1); \ - fxy = &f(x+1,y+1,z ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x + 1, y, z ); \ + fy = &f( x, y + 1, z ); \ + fz = &f( x, y, z + 1 ); \ + fyz = &f( x, y + 1, z + 1 ); \ + fzx = &f( x + 1, y, z + 1 ); \ + fxy = &f( x + 1, y + 1, z ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + fyz++; \ + fzx++; \ + fxy++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 1; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 1; \ + INIT_STENCIL(); \ + } -#define REDUCE_EN() \ - en_ex += 0.25*( m[ f0->ematx].epsx* f0->ex * f0->ex + \ - m[ fy->ematx].epsx* fy->ex * fy->ex + \ - m[ fz->ematx].epsx* fz->ex * fz->ex + \ - m[fyz->ematx].epsx*fyz->ex *fyz->ex ); \ - en_ey += 0.25*( m[ f0->ematy].epsy* f0->ey * f0->ey + \ - m[ fz->ematy].epsy* fz->ey * fz->ey + \ - m[ fx->ematy].epsy* fx->ey * fx->ey + \ - m[fzx->ematy].epsy*fzx->ey *fzx->ey ); \ - en_ez += 0.25*( m[ f0->ematz].epsz* f0->ez * f0->ez + \ - m[ fx->ematz].epsz* fx->ez * fx->ez + \ - m[ fy->ematz].epsz* fy->ez * fy->ez + \ - m[fxy->ematz].epsz*fxy->ez *fxy->ez ); \ - en_bx += 0.5 *( m[ f0->fmatx].rmux* f0->cbx* f0->cbx + \ - m[ fx->fmatx].rmux* fx->cbx* fx->cbx ); \ - en_by += 0.5 *( m[ f0->fmaty].rmuy* f0->cby* f0->cby + \ - m[ fy->fmaty].rmuy* fy->cby* fy->cby ); \ - en_bz += 0.5 *( m[ 
f0->fmatz].rmuz* f0->cbz* f0->cbz + \ - m[ fz->fmatz].rmuz* fz->cbz* fz->cbz ) +#define REDUCE_EN() \ + en_ex += 0.25 * ( m[f0->ematx].epsx * f0->ex * f0->ex + \ + m[fy->ematx].epsx * fy->ex * fy->ex + \ + m[fz->ematx].epsx * fz->ex * fz->ex + \ + m[fyz->ematx].epsx * fyz->ex * fyz->ex ); \ + en_ey += 0.25 * ( m[f0->ematy].epsy * f0->ey * f0->ey + \ + m[fz->ematy].epsy * fz->ey * fz->ey + \ + m[fx->ematy].epsy * fx->ey * fx->ey + \ + m[fzx->ematy].epsy * fzx->ey * fzx->ey ); \ + en_ez += 0.25 * ( m[f0->ematz].epsz * f0->ez * f0->ez + \ + m[fx->ematz].epsz * fx->ez * fx->ez + \ + m[fy->ematz].epsz * fy->ez * fy->ez + \ + m[fxy->ematz].epsz * fxy->ez * fxy->ez ); \ + en_bx += 0.5 * ( m[f0->fmatx].rmux * f0->cbx * f0->cbx + \ + m[fx->fmatx].rmux * fx->cbx * fx->cbx ); \ + en_by += 0.5 * ( m[f0->fmaty].rmuy * f0->cby * f0->cby + \ + m[fy->fmaty].rmuy * fy->cby * fy->cby ); \ + en_bz += 0.5 * ( m[f0->fmatz].rmuz * f0->cbz * f0->cbz + \ + m[fz->fmatz].rmuz * fz->cbz * fz->cbz ) -void -energy_f_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void energy_f_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _energy_f_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h index e8c8f1ec..cc3133bc 100644 --- a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h @@ -2,90 +2,106 @@ #define _vacuum_advance_e_pipeline_h_ #ifndef IN_vacuum_advance_e_pipeline -#error "Only include vacuum_advance_e_pipeline.h in vacuum_advance_e_pipeline source files." +#error \ + "Only include vacuum_advance_e_pipeline.h in vacuum_advance_e_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float decayx = m->decayx, drivex = m->drivex; \ - const float decayy = m->decayy, drivey = m->drivey; \ - const float decayz = m->decayz, drivez = m->drivez; \ - const float damp = args->p->damp; \ - const float px_muz = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ - const float px_muy = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ - const float py_mux = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmux; \ - const float py_muz = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ - const float pz_muy = ((nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ - const float pz_mux = ((nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmux; \ - const float cj = g->dt/g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float decayx = m->decayx, drivex = m->drivex; \ + const float decayy = m->decayy, drivey = m->drivey; \ + const float decayz = m->decayz, drivez = m->drivez; \ + const float damp = args->p->damp; \ + const float px_muz = \ + ( ( nx > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdx : 0 ) * \ + m->rmuz; \ + const float px_muy = \ + ( ( nx > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdx : 0 ) * \ + m->rmuy; \ + const float py_mux = \ + ( ( ny > 1 ) ? 
( 1 + damp ) * g->cvac * g->dt * g->rdy : 0 ) * \ + m->rmux; \ + const float py_muz = \ + ( ( ny > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdy : 0 ) * \ + m->rmuz; \ + const float pz_muy = \ + ( ( nz > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdz : 0 ) * \ + m->rmuy; \ + const float pz_mux = \ + ( ( nz > 1 ) ? ( 1 + damp ) * g->cvac * g->dt * g->rdz : 0 ) * \ + m->rmux; \ + const float cj = g->dt / g->eps0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } -#define UPDATE_EX() \ - f0->tcax = ( py_muz * ( f0->cbz - fy->cbz ) - \ - pz_muy * ( f0->cby - fz->cby ) ) - damp * f0->tcax; \ - f0->ex = decayx * f0->ex + drivex * ( f0->tcax - cj * f0->jfx ) +#define UPDATE_EX() \ + f0->tcax = \ + ( py_muz * ( f0->cbz - fy->cbz ) - pz_muy * ( f0->cby - fz->cby ) ) - \ + damp * f0->tcax; \ + f0->ex = decayx * f0->ex + drivex * ( f0->tcax - cj * f0->jfx ) -#define UPDATE_EY() \ - f0->tcay = ( pz_mux * ( f0->cbx - fz->cbx ) - \ - px_muz * ( f0->cbz - fx->cbz ) ) - damp * f0->tcay; \ - f0->ey = decayy * f0->ey + drivey * ( f0->tcay - cj * f0->jfy ) +#define UPDATE_EY() \ + f0->tcay = \ + ( pz_mux * ( f0->cbx - fz->cbx ) - px_muz * ( f0->cbz - fx->cbz ) ) - \ + damp * f0->tcay; \ + f0->ey = 
decayy * f0->ey + drivey * ( f0->tcay - cj * f0->jfy ) -#define UPDATE_EZ() \ - f0->tcaz = ( px_muy * ( f0->cby - fx->cby ) - \ - py_mux * ( f0->cbx - fy->cbx ) ) - damp * f0->tcaz; \ - f0->ez = decayz * f0->ez + drivez * ( f0->tcaz - cj * f0->jfz ) +#define UPDATE_EZ() \ + f0->tcaz = \ + ( px_muy * ( f0->cby - fx->cby ) - py_mux * ( f0->cbx - fy->cbx ) ) - \ + damp * f0->tcaz; \ + f0->ez = decayz * f0->ez + drivez * ( f0->tcaz - cj * f0->jfz ) -void -vacuum_advance_e_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_advance_e_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -vacuum_advance_e_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_advance_e_pipeline_v4( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -vacuum_advance_e_pipeline_v8( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_advance_e_pipeline_v8( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -vacuum_advance_e_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_advance_e_pipeline_v16( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _vacuum_advance_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h index 4d85ba2a..e448eb6c 100644 --- a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h @@ -2,60 +2,69 @@ #define _vacuum_clean_div_e_pipeline_h_ #ifndef IN_vacuum_clean_div_e_pipeline -#error "Only include vacuum_clean_div_e_pipeline.h in vacuum_clean_div_e_pipeline source files." +#error \ + "Only include vacuum_clean_div_e_pipeline.h in vacuum_clean_div_e_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float _rdx = (nx>1) ? g->rdx : 0; \ - const float _rdy = (ny>1) ? g->rdy : 0; \ - const float _rdz = (nz>1) ? g->rdz : 0; \ - const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ - const float px = (alphadt*_rdx)*m->drivex; \ - const float py = (alphadt*_rdy)*m->drivey; \ - const float pz = (alphadt*_rdz)*m->drivez; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] - -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x+1, y, z ); \ - fy = &f( x, y+1, z ); \ - fz = &f( x, y, z+1 ) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 1; \ - if ( y > ny ) z++; if ( y > ny ) y = 1; \ - INIT_STENCIL(); \ - } +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float _rdx = ( nx > 1 ) ? g->rdx : 0; \ + const float _rdy = ( ny > 1 ) ? g->rdy : 0; \ + const float _rdz = ( nz > 1 ) ? 
g->rdz : 0; \ + const float alphadt = \ + 0.3888889 / ( _rdx * _rdx + _rdy * _rdy + _rdz * _rdz ); \ + const float px = ( alphadt * _rdx ) * m->drivex; \ + const float py = ( alphadt * _rdy ) * m->drivey; \ + const float pz = ( alphadt * _rdz ) * m->drivez; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x + 1, y, z ); \ + fy = &f( x, y + 1, z ); \ + fz = &f( x, y, z + 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 1; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 1; \ + INIT_STENCIL(); \ + } #define MARDER_EX() f0->ex += px * ( fx->div_e_err - f0->div_e_err ) #define MARDER_EY() f0->ey += py * ( fy->div_e_err - f0->div_e_err ) #define MARDER_EZ() f0->ez += pz * ( fz->div_e_err - f0->div_e_err ) -static void -vacuum_clean_div_e_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +static void vacuum_clean_div_e_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, + int n_pipeline ); #endif // _vacuum_clean_div_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h index ca78b79d..9d388f6e 100644 --- a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h @@ -2,79 +2,89 @@ #define _vacuum_compute_curl_b_pipeline_h_ #ifndef IN_vacuum_compute_curl_b_pipeline -#error "Only include vacuum_compute_curl_b_pipeline.h in vacuum_compute_curl_b_pipeline source files." +#error \ + "Only include vacuum_compute_curl_b_pipeline.h in vacuum_compute_curl_b_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px_muz = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ - const float px_muy = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ - const float py_mux = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmux; \ - const float py_muz = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ - const float pz_muy = ((nz>1) ? g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ - const float pz_mux = ((nz>1) ? g->cvac*g->dt*g->rdz : 0)*m->rmux; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z +#define DECLARE_STENCIL() \ + field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px_muz = \ + ( ( nx > 1 ) ? g->cvac * g->dt * g->rdx : 0 ) * m->rmuz; \ + const float px_muy = \ + ( ( nx > 1 ) ? g->cvac * g->dt * g->rdx : 0 ) * m->rmuy; \ + const float py_mux = \ + ( ( ny > 1 ) ? g->cvac * g->dt * g->rdy : 0 ) * m->rmux; \ + const float py_muz = \ + ( ( ny > 1 ) ? g->cvac * g->dt * g->rdy : 0 ) * m->rmuz; \ + const float pz_muy = \ + ( ( nz > 1 ) ? g->cvac * g->dt * g->rdz : 0 ) * m->rmuy; \ + const float pz_mux = \ + ( ( nz > 1 ) ? 
g->cvac * g->dt * g->rdz : 0 ) * m->rmux; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } -#define UPDATE_EX() f0->tcax = ( py_muz * ( f0->cbz - fy->cbz ) - \ - pz_muy * ( f0->cby - fz->cby ) ) +#define UPDATE_EX() \ + f0->tcax = \ + ( py_muz * ( f0->cbz - fy->cbz ) - pz_muy * ( f0->cby - fz->cby ) ) -#define UPDATE_EY() f0->tcay = ( pz_mux * ( f0->cbx - fz->cbx ) - \ - px_muz * ( f0->cbz - fx->cbz ) ) +#define UPDATE_EY() \ + f0->tcay = \ + ( pz_mux * ( f0->cbx - fz->cbx ) - px_muz * ( f0->cbz - fx->cbz ) ) -#define UPDATE_EZ() f0->tcaz = ( px_muy * ( f0->cby - fx->cby ) - \ - py_mux * ( f0->cbx - fy->cbx ) ) +#define UPDATE_EZ() \ + f0->tcaz = \ + ( px_muy * ( f0->cby - fx->cby ) - py_mux * ( f0->cbx - fy->cbx ) ) -void -vacuum_compute_curl_b_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_compute_curl_b_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -vacuum_compute_curl_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_compute_curl_b_pipeline_v4( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -vacuum_compute_curl_b_pipeline_v8( 
pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_compute_curl_b_pipeline_v8( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -vacuum_compute_curl_b_pipeline_v16( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_compute_curl_b_pipeline_v16( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); #endif // _vacuum_compute_curl_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h index a31ecfee..9d2b34da 100644 --- a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h @@ -2,59 +2,67 @@ #define _vacuum_compute_div_e_err_pipeline_h_ #ifndef IN_vacuum_compute_div_e_err_pipeline -#error "Only include vacuum_compute_div_e_err_pipeline.h in vacuum_compute_div_e_err_pipeline source files." +#error \ + "Only include vacuum_compute_div_e_err_pipeline.h in vacuum_compute_div_e_err_pipeline source files." #endif #include "../sfa_private.h" typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + /**/ field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float nc = m->nonconductive; \ - const float px = ((nx>1) ? g->rdx : 0)*m->epsx; \ - const float py = ((ny>1) ? g->rdy : 0)*m->epsy; \ - const float pz = ((nz>1) ? 
g->rdz : 0)*m->epsz; \ - const float cj = 1./g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] - -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - /**/ y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->div_e_err = nc * ( px * ( f0->ex - fx->ex ) + \ - py * ( f0->ey - fy->ey ) + \ - pz * ( f0->ez - fz->ez ) - \ - cj * ( f0->rhof + f0->rhob ) ) - -void -vacuum_compute_div_e_err_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +#define DECLARE_STENCIL() \ + /**/ field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float nc = m->nonconductive; \ + const float px = ( ( nx > 1 ) ? g->rdx : 0 ) * m->epsx; \ + const float py = ( ( ny > 1 ) ? g->rdy : 0 ) * m->epsy; \ + const float pz = ( ( nz > 1 ) ? g->rdz : 0 ) * m->epsz; \ + const float cj = 1. 
/ g->eps0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() \ + f0->div_e_err = \ + nc * ( px * ( f0->ex - fx->ex ) + py * ( f0->ey - fy->ey ) + \ + pz * ( f0->ez - fz->ez ) - cj * ( f0->rhof + f0->rhob ) ) + +void vacuum_compute_div_e_err_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, + int n_pipeline ); #endif // _vacuum_compute_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h index 066b31f3..2bedf8a6 100644 --- a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h @@ -2,57 +2,64 @@ #define _vacuum_compute_rhob_pipeline_h_ #ifndef IN_vacuum_compute_rhob_pipeline -#error "Only include vacuum_compute_rhob_pipeline.h in vacuum_compute_rhob_pipeline source files." +#error \ + "Only include vacuum_compute_rhob_pipeline.h in vacuum_compute_rhob_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; + /**/ field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; } pipeline_args_t; -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float nc = m->nonconductive; \ - const float px = (nx>1) ? g->eps0*m->epsx*g->rdx : 0; \ - const float py = (ny>1) ? g->eps0*m->epsy*g->rdy : 0; \ - const float pz = (nz>1) ? g->eps0*m->epsz*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] - -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x-1, y, z ); \ - fy = &f( x, y-1, z ); \ - fz = &f( x, y, z-1 ) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if ( x > nx ) \ - { \ - /**/ y++; x = 2; \ - if ( y > ny ) z++; if ( y > ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->rhob = nc * ( px * ( f0->ex - fx->ex ) + \ - py * ( f0->ey - fy->ey ) + \ - pz * ( f0->ez - fz->ez ) - f0->rhof ) - -void -vacuum_compute_rhob_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +#define DECLARE_STENCIL() \ + /**/ field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float nc = m->nonconductive; \ + const float px = ( nx > 1 ) ? g->eps0 * m->epsx * g->rdx : 0; \ + const float py = ( ny > 1 ) ? g->eps0 * m->epsy * g->rdy : 0; \ + const float pz = ( nz > 1 ) ? 
g->eps0 * m->epsz * g->rdz : 0; \ + \ + field_t* ALIGNED( 16 ) f0; \ + field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + int x, y, z + +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x - 1, y, z ); \ + fy = &f( x, y - 1, z ); \ + fz = &f( x, y, z - 1 ) + +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 2; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() \ + f0->rhob = nc * ( px * ( f0->ex - fx->ex ) + py * ( f0->ey - fy->ey ) + \ + pz * ( f0->ez - fz->ez ) - f0->rhof ) + +void vacuum_compute_rhob_pipeline_scalar( pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); #endif // _vacuum_compute_rhob_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h index 3efc067e..47748818 100644 --- a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h +++ b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h @@ -2,81 +2,82 @@ #define _vacuum_energy_f_pipeline_h_ #ifndef IN_vacuum_energy_f_pipeline -#error "Only include vacuum_energy_f_pipeline.h in vacuum_energy_f_pipeline source files." +#error \ + "Only include vacuum_energy_f_pipeline.h in vacuum_energy_f_pipeline source files." 
#endif #include "../sfa_private.h" typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; - double en[MAX_PIPELINE+1][6]; + const field_t* ALIGNED( 128 ) f; + const sfa_params_t* p; + const grid_t* g; + double en[MAX_PIPELINE + 1][6]; } pipeline_args_t; -#define DECLARE_STENCIL() \ - const field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float qepsx = 0.25*m->epsx; \ - const float qepsy = 0.25*m->epsy; \ - const float qepsz = 0.25*m->epsz; \ - const float hrmux = 0.50*m->rmux; /* was previously 0.25 in master */ \ - const float hrmuy = 0.50*m->rmuy; /* was previously 0.25 in master */ \ - const float hrmuz = 0.50*m->rmuz; /* was previously 0.25 in master */ \ - \ - const field_t * ALIGNED(16) f0; \ - const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ - double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ - int x, y, z +#define DECLARE_STENCIL() \ + const field_t* ALIGNED( 128 ) f = args->f; \ + const material_coefficient_t* ALIGNED( 128 ) m = args->p->mc; \ + const grid_t* g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float qepsx = 0.25 * m->epsx; \ + const float qepsy = 0.25 * m->epsy; \ + const float qepsz = 0.25 * m->epsz; \ + const float hrmux = 0.50 * m->rmux; /* was previously 0.25 in master */ \ + const float hrmuy = 0.50 * m->rmuy; /* was previously 0.25 in master */ \ + const float hrmuz = 0.50 * m->rmuz; /* was previously 0.25 in master */ \ + \ + const field_t* ALIGNED( 16 ) f0; \ + const field_t *ALIGNED( 16 ) fx, *ALIGNED( 16 ) fy, *ALIGNED( 16 ) fz; \ + const field_t *ALIGNED( 16 ) fyz, *ALIGNED( 16 ) fzx, *ALIGNED( 16 ) fxy; \ + double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ + 
int x, y, z -#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f( x, y, z ) f[VOXEL( x, y, z, nx, ny, nz )] -#define INIT_STENCIL() \ - f0 = &f( x, y, z ); \ - fx = &f( x+1, y, z ); \ - fy = &f( x, y+1, z ); \ - fz = &f( x, y, z+1 ); \ - fyz = &f( x, y+1, z+1 ); \ - fzx = &f( x+1, y, z+1 ); \ - fxy = &f( x+1, y+1, z ) +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x + 1, y, z ); \ + fy = &f( x, y + 1, z ); \ + fz = &f( x, y, z + 1 ); \ + fyz = &f( x, y + 1, z + 1 ); \ + fzx = &f( x + 1, y, z + 1 ); \ + fxy = &f( x + 1, y + 1, z ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ - if ( x > nx ) \ - { \ - /**/ y++; x = 1; \ - if ( y > ny ) z++; if ( y > ny ) y = 1; \ - INIT_STENCIL(); \ - } +#define NEXT_STENCIL() \ + f0++; \ + fx++; \ + fy++; \ + fz++; \ + fyz++; \ + fzx++; \ + fxy++; \ + x++; \ + if ( x > nx ) \ + { \ + /**/ y++; \ + x = 1; \ + if ( y > ny ) \ + z++; \ + if ( y > ny ) \ + y = 1; \ + INIT_STENCIL(); \ + } -#define REDUCE_EN() \ - en_ex += qepsx * ( f0->ex * f0->ex + \ - fy->ex * fy->ex + \ - fz->ex * fz->ex + \ - fyz->ex * fyz->ex ); \ - en_ey += qepsy * ( f0->ey * f0->ey + \ - fz->ey * fz->ey + \ - fx->ey * fx->ey + \ - fzx->ey * fzx->ey ); \ - en_ez += qepsz * ( f0->ez * f0->ez + \ - fx->ez * fx->ez + \ - fy->ez * fy->ez + \ - fxy->ez * fxy->ez ); \ - en_bx += hrmux * ( f0->cbx * f0->cbx + \ - fx->cbx * fx->cbx ); \ - en_by += hrmuy * ( f0->cby * f0->cby + \ - fy->cby * fy->cby ); \ - en_bz += hrmuz * ( f0->cbz * f0->cbz + \ - fz->cbz * fz->cbz ) +#define REDUCE_EN() \ + en_ex += qepsx * ( f0->ex * f0->ex + fy->ex * fy->ex + fz->ex * fz->ex + \ + fyz->ex * fyz->ex ); \ + en_ey += qepsy * ( f0->ey * f0->ey + fz->ey * fz->ey + fx->ey * fx->ey + \ + fzx->ey * fzx->ey ); \ + en_ez += qepsz * ( f0->ez * f0->ez + fx->ez * fx->ez + fy->ez * fy->ez + \ + fxy->ez * fxy->ez ); \ + en_bx += hrmux * ( f0->cbx * f0->cbx + fx->cbx * fx->cbx ); \ + en_by += hrmuy * ( f0->cby * f0->cby + fy->cby * fy->cby ); 
\ + en_bz += hrmuz * ( f0->cbz * f0->cbz + fz->cbz * fz->cbz ) -void -vacuum_energy_f_pipeline_scalar( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void vacuum_energy_f_pipeline_scalar( pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _vacuum_energy_f_pipeline_h_ diff --git a/src/field_advance/standard/sfa_private.h b/src/field_advance/standard/sfa_private.h index 5cad3aa6..6a125556 100644 --- a/src/field_advance/standard/sfa_private.h +++ b/src/field_advance/standard/sfa_private.h @@ -13,47 +13,40 @@ typedef struct material_coefficient { - float decayx, drivex; // Decay of ex and drive of (curl H)x and Jx - float decayy, drivey; // Decay of ey and drive of (curl H)y and Jy - float decayz, drivez; // Decay of ez and drive of (curl H)z and Jz - float rmux, rmuy, rmuz; // Reciprocle of relative permeability - float nonconductive; // Divergence cleaning related coefficients - float epsx, epsy, epsz; - float pad[3]; // For 64-byte alignment and future expansion + float decayx, drivex; // Decay of ex and drive of (curl H)x and Jx + float decayy, drivey; // Decay of ey and drive of (curl H)y and Jy + float decayz, drivez; // Decay of ez and drive of (curl H)z and Jz + float rmux, rmuy, rmuz; // Reciprocle of relative permeability + float nonconductive; // Divergence cleaning related coefficients + float epsx, epsy, epsz; + float pad[3]; // For 64-byte alignment and future expansion } material_coefficient_t; typedef struct sfa_params { - material_coefficient_t * mc; - int n_mc; - float damp; + material_coefficient_t* mc; + int n_mc; + float damp; } sfa_params_t; BEGIN_C_DECLS // In standard_field_advance.c -void -delete_standard_field_array( field_array_t * RESTRICT fa ); +void delete_standard_field_array( field_array_t* RESTRICT fa ); -void -clear_jf( field_array_t * RESTRICT fa ); +void clear_jf( field_array_t* RESTRICT fa ); -void -clear_rhof( field_array_t * RESTRICT fa ); +void clear_rhof( field_array_t* RESTRICT fa ); // 
In advance_b.c // advance_b applies the following difference equation to the fields: // c B_new = c B_old - frac c dt curl E -void -advance_b( field_array_t * RESTRICT fa, - float frac ); +void advance_b( field_array_t* RESTRICT fa, float frac ); -void -advance_b_pipeline( field_array_t * RESTRICT fa, - float _frac ); +void advance_b_pipeline( field_array_t* RESTRICT fa, float _frac ); // In advance_e.c @@ -61,7 +54,7 @@ advance_b_pipeline( field_array_t * RESTRICT fa, // tca_new = ( 1 + damp ) c dt curl ( c B / mu_r ) - // damp tca_old // E_new = decay E_old + drive [ tca_new - (dt/eps0) Jf ] -// where: +// where: // damp is numerical Cherenkov damping parameter // decay = exp( -alpha ) // drive = ( 1 - decay ) / ( alpha eps_r ) @@ -74,21 +67,13 @@ advance_b_pipeline( field_array_t * RESTRICT fa, // // FIXME: Currently, frac must be 1. -void -advance_e( field_array_t * RESTRICT fa, - float frac ); +void advance_e( field_array_t* RESTRICT fa, float frac ); -void -advance_e_pipeline( field_array_t * RESTRICT fa, - float frac ); +void advance_e_pipeline( field_array_t* RESTRICT fa, float frac ); -void -vacuum_advance_e( field_array_t * RESTRICT fa, - float frac ); +void vacuum_advance_e( field_array_t* RESTRICT fa, float frac ); -void -vacuum_advance_e_pipeline( field_array_t * RESTRICT fa, - float frac ); +void vacuum_advance_e_pipeline( field_array_t* RESTRICT fa, float frac ); // In energy_f.c @@ -105,21 +90,16 @@ vacuum_advance_e_pipeline( field_array_t * RESTRICT fa, // // vacuum_energy_f is the high performance version for uniform regions -void -energy_f( double * RESTRICT en, // 6 elem array - const field_array_t * RESTRICT fa ); +void energy_f( double* RESTRICT en, // 6 elem array + const field_array_t* RESTRICT fa ); -void -energy_f_pipeline( double * global, - const field_array_t * RESTRICT fa ); +void energy_f_pipeline( double* global, const field_array_t* RESTRICT fa ); -void -vacuum_energy_f( double * RESTRICT en, // 6 elem array - const field_array_t * 
RESTRICT fa ); +void vacuum_energy_f( double* RESTRICT en, // 6 elem array + const field_array_t* RESTRICT fa ); -void -vacuum_energy_f_pipeline( double * global, - const field_array_t * RESTRICT fa ); +void vacuum_energy_f_pipeline( double* global, + const field_array_t* RESTRICT fa ); // In compute_curl_b.c @@ -131,17 +111,13 @@ vacuum_energy_f_pipeline( double * global, // // vacuum_compute_curl_b is the high performance version for uniform regions -void -compute_curl_b( field_array_t * RESTRICT fa ); +void compute_curl_b( field_array_t* RESTRICT fa ); -void -compute_curl_b_pipeline( field_array_t * RESTRICT fa ); +void compute_curl_b_pipeline( field_array_t* RESTRICT fa ); -void -vacuum_compute_curl_b( field_array_t * RESTRICT fa ); +void vacuum_compute_curl_b( field_array_t* RESTRICT fa ); -void -vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); +void vacuum_compute_curl_b_pipeline( field_array_t* RESTRICT fa ); // The theory behind the Marder correction is that the Ampere and // Faraday equations can be modified as follows: @@ -151,7 +127,7 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // p(div B)/pt = alpha laplacian div B // p(div D-rho)/pt = alpha laplacian ( div D - rho ) // Since these are sourceless diffusion equation, asymptotically, -// div B --> 0 +// div B --> 0 // div D - rho --> 0 // In particular, Fourier transforming div B in space shows that a // given mode decays as exp(-alpha k^2 t). The diffusion coefficient @@ -167,7 +143,7 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // do not change _any_ physics. Further, if for any reason a non-zero // div B or (div D - rho) occurs, the above modification will drive // the error back to zero. 
-// +// // To understand how use this in a simulation, consider the standard // field update equations for Bx on a Yee mesh without the additional // term: @@ -181,7 +157,7 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // error in cBx for an arbitrary grid point will be closely // approximated by a Gaussian with zero mean and standard deviation // ~0.5 eps |cBx| sqrt(Nt). The same holds true for cBy and cBz. -// +// // If it is assumed that the errors between different grid points are // uncorrelated (a _very_ accurate assumption except for very // specially prepared field configurations), then the power in various @@ -194,11 +170,11 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // using forward differencing in time (this is the usual Marder pass // ... strictly local operations, easy and efficient to implement in // parallel): -// cBx(1/2)_clean = cBx(1/2)_unclean + +// cBx(1/2)_clean = cBx(1/2)_unclean + // alpha dt grad div cBx(1/2)_unclean // The power in various modes of cBx(1/2)_clean can be shown to be: // |div cB(kx,ky,kz)_clean|^2 ~ -// |div cB(kx,ky,kz)_unclean|^2 +// |div cB(kx,ky,kz)_unclean|^2 // { 1 - (4*alpha*dt/dg^2) [ (dg sin(pi kx/Nx)/dx)^2 + // (dg sin(pi ky/Ny)/dy)^2 + // (dg sin(pi kz/Nz)/dz)^2 ] }^2 @@ -208,11 +184,11 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // of div cB(kx,ky,kz) grows and the divergence cleaning pass is // numerically stable. Note: This is the same stability criterion as // the forward differenced diffusion equation. -// +// // If alpha dt = dg^2/4, then shortest wavelength component of div cB // will be zeroed. Since this is where most of the divergence errors // are located, this is a relatively good choice. 
-// +// // If we want to minimize the total RMS divergence error, it can be // shown (using Parseval's theorem) that the best choice of alpha dt // on large cubic periodic meshes is: @@ -220,7 +196,7 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // This value is pretty close to optimal on other meshes also. Using // this value will take the total RMS divergence error to ~0.304 of // the original value. -// +// // If we assume future contributions to the divergence error are // uncorrelated with previous contributions (a very accurate // assumption) and we are only going to clean every Nc time steps, @@ -252,7 +228,7 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // gives the complete modified Marder pass: // E_clean = E_unclean + // drive alpha dt grad nonconductive (div epsr E - rho/eps0) - + // In compute_rhob.c // compute_rhob applies the following difference equation: @@ -265,17 +241,13 @@ vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ); // // vacuum_compute_rhob is the high performance version for uniform regions -void -compute_rhob( field_array_t * RESTRICT fa ); +void compute_rhob( field_array_t* RESTRICT fa ); -void -compute_rhob_pipeline( field_array_t * RESTRICT fa ); +void compute_rhob_pipeline( field_array_t* RESTRICT fa ); -void -vacuum_compute_rhob( field_array_t * RESTRICT fa ); +void vacuum_compute_rhob( field_array_t* RESTRICT fa ); -void -vacuum_compute_rhob_pipeline( field_array_t * RESTRICT fa ); +void vacuum_compute_rhob_pipeline( field_array_t* RESTRICT fa ); // In compute_div_e_err.c @@ -286,17 +258,13 @@ vacuum_compute_rhob_pipeline( field_array_t * RESTRICT fa ); // // vacuum_compute_div_e_err is the high performance version for uniform regions -void -compute_div_e_err( field_array_t * RESTRICT fa ); +void compute_div_e_err( field_array_t* RESTRICT fa ); -void -compute_div_e_err_pipeline( field_array_t * RESTRICT fa ); +void compute_div_e_err_pipeline( field_array_t* RESTRICT fa ); -void 
-vacuum_compute_div_e_err( field_array_t * RESTRICT fa ); +void vacuum_compute_div_e_err( field_array_t* RESTRICT fa ); -void -vacuum_compute_div_e_err_pipeline( field_array_t * RESTRICT fa ); +void vacuum_compute_div_e_err_pipeline( field_array_t* RESTRICT fa ); // In compute_rms_div_e_err.c @@ -307,11 +275,9 @@ vacuum_compute_div_e_err_pipeline( field_array_t * RESTRICT fa ); // domains. Every processor gets the same value. Note that this // function does _not_ update or recompute div_e_err. -double -compute_rms_div_e_err( const field_array_t * RESTRICT fa ); +double compute_rms_div_e_err( const field_array_t* RESTRICT fa ); -double -compute_rms_div_e_err_pipeline( const field_array_t * RESTRICT fa ); +double compute_rms_div_e_err_pipeline( const field_array_t* RESTRICT fa ); // In clean_div_e.c @@ -321,28 +287,22 @@ compute_rms_div_e_err_pipeline( const field_array_t * RESTRICT fa ); // // vacuum_clean_div_e is the high performance version for uniform regions -void -clean_div_e( field_array_t * RESTRICT fa ); +void clean_div_e( field_array_t* RESTRICT fa ); -void -clean_div_e_pipeline( field_array_t * fa ); +void clean_div_e_pipeline( field_array_t* fa ); -void -vacuum_clean_div_e( field_array_t * RESTRICT fa ); +void vacuum_clean_div_e( field_array_t* RESTRICT fa ); -void -vacuum_clean_div_e_pipeline( field_array_t * RESTRICT fa ); +void vacuum_clean_div_e_pipeline( field_array_t* RESTRICT fa ); // In compute_div_b_err.c // compute_div_b_err applies the following difference equation: // div_b_err = div cB -void -compute_div_b_err( field_array_t * RESTRICT fa ); +void compute_div_b_err( field_array_t* RESTRICT fa ); -void -compute_div_b_err_pipeline( field_array_t * RESTRICT fa ); +void compute_div_b_err_pipeline( field_array_t* RESTRICT fa ); // In compute_rms_div_b_err.c @@ -354,11 +314,9 @@ compute_div_b_err_pipeline( field_array_t * RESTRICT fa ); // value. Uses the value of div_b_err already stored. It _does_ // _not_ recompute div_b_err. 
-double -compute_rms_div_b_err( const field_array_t * RESTRICT fa ); +double compute_rms_div_b_err( const field_array_t* RESTRICT fa ); -double -compute_rms_div_b_err_pipeline( const field_array_t * RESTRICT fa ); +double compute_rms_div_b_err_pipeline( const field_array_t* RESTRICT fa ); // In clean_div_b.c @@ -366,88 +324,53 @@ compute_rms_div_b_err_pipeline( const field_array_t * RESTRICT fa ); // cB_new = cB_old + alpha dt grad div_b_err // alpha is picked to rapidly reduce the rms_div_b_err -void -clean_div_b( field_array_t * RESTRICT fa ); +void clean_div_b( field_array_t* RESTRICT fa ); -void -clean_div_b_pipeline( field_array_t * RESTRICT fa ); +void clean_div_b_pipeline( field_array_t* RESTRICT fa ); // Internode functions // In remote.c -double -synchronize_tang_e_norm_b( field_array_t * RESTRICT fa ); +double synchronize_tang_e_norm_b( field_array_t* RESTRICT fa ); -void -synchronize_jf( field_array_t * RESTRICT fa ); +void synchronize_jf( field_array_t* RESTRICT fa ); -void -synchronize_rho( field_array_t * RESTRICT fa ); +void synchronize_rho( field_array_t* RESTRICT fa ); // In local.c -void -local_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_ghost_tang_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_ghost_norm_e( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_ghost_div_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_adjust_tang_e( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_tang_e( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_adjust_div_e( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_div_e( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_adjust_norm_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_norm_b( field_t* ALIGNED( 128 ) f, const grid_t* g 
); -void -local_adjust_jf( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_jf( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_adjust_rhof( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_rhof( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -local_adjust_rhob( field_t * ALIGNED(128) f, - const grid_t * g ); +void local_adjust_rhob( field_t* ALIGNED( 128 ) f, const grid_t* g ); // In remote.c -void -begin_remote_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void begin_remote_ghost_tang_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -end_remote_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void end_remote_ghost_tang_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -begin_remote_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +void begin_remote_ghost_norm_e( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -end_remote_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +void end_remote_ghost_norm_e( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -begin_remote_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void begin_remote_ghost_div_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); -void -end_remote_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +void end_remote_ghost_div_b( field_t* ALIGNED( 128 ) f, const grid_t* g ); END_C_DECLS diff --git a/src/grid/grid.h b/src/grid/grid.h index 7654fe94..7a18a26f 100644 --- a/src/grid/grid.h +++ b/src/grid/grid.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. 
* Plasma Physics Group (X-1) @@ -13,127 +13,131 @@ #include "../util/util.h" -#define BOUNDARY(i,j,k) (13+(i)+3*(j)+9*(k)) /* FORTRAN -1:1,-1:1,-1:1 */ - -enum grid_enums { - - // Phase 2 boundary conditions - anti_symmetric_fields = -1, // E_tang = 0 - pec_fields = -1, - metal_fields = -1, - symmetric_fields = -2, // B_tang = 0, B_norm = 0 - pmc_fields = -3, // B_tang = 0, B_norm floats - absorb_fields = -4, // Gamma = 0 - - // Phase 3 boundary conditions - reflect_particles = -1, // Cell boundary should reflect particles - absorb_particles = -2 // Cell boundary should absorb particles - - // Symmetry in the field boundary conditions refers to image charge - // sign - // - // Anti-symmetric -> Image charges are opposite signed (ideal metal) - // Boundary rho/j are accumulated over partial voxel+image - // Symmetric -> Image charges are same signed (symmetry plane or pmc) - // Boundary rho/j are accumulated over partial voxel+image - // Absorbing -> No image charges - // Boundary rho/j are accumulated over partial voxel only - // - // rho -> Anti-symmetric | rho -> Symmetric - // jf_tang -> Anti-symmetric | jf_tang -> Symmetric - // E_tang -> Anti-symmetric | E_tang -> Symmetric - // B_norm -> Anti-symmetric + DC | B_norm -> Symmetric (see note) - // B_tang -> Symmetric | B_tang -> Anti-symmetric - // E_norm -> Symmetric | E_norm -> Anti-symmetric (see note) - // div B -> Symmetric | div B -> Anti-symmetric - // - // Note: B_norm is tricky. For a symmetry plane, B_norm on the - // boundary must be zero as there are no magnetic charges (a - // non-zero B_norm would imply an infinitesimal layer of magnetic - // charge). However, if a symmetric boundary is interpreted as a - // perfect magnetic conductor, B_norm could be present due to - // magnetic conduction surface charges. 
Even though there are no - // bulk volumetric magnetic charges to induce a surface magnetic - // charge, I think that radiation/waveguide modes/etc could (the - // total surface magnetic charge in the simulation would be zero - // though). As a result, symmetric and pmc boundary conditions are - // treated separately. Symmetric and pmc boundaries are identical - // except the symmetric boundaries explicitly zero boundary - // B_norm. Note: anti-symmetric and pec boundary conditions would - // have the same issue if norm E was located directly on the - // boundary. However, it is not so this problem does not arise. - // - // Note: Absorbing boundary conditions make no effort to clean - // divergence errors on them. They assume that the ghost div b is - // zero and force the surface div e on them to be zero. This means - // ghost norm e can be set to any value on absorbing boundaries. +#define BOUNDARY( i, j, k ) \ + ( 13 + ( i ) + 3 * ( j ) + 9 * ( k ) ) /* FORTRAN -1:1,-1:1,-1:1 */ + +enum grid_enums +{ + + // Phase 2 boundary conditions + anti_symmetric_fields = -1, // E_tang = 0 + pec_fields = -1, + metal_fields = -1, + symmetric_fields = -2, // B_tang = 0, B_norm = 0 + pmc_fields = -3, // B_tang = 0, B_norm floats + absorb_fields = -4, // Gamma = 0 + + // Phase 3 boundary conditions + reflect_particles = -1, // Cell boundary should reflect particles + absorb_particles = -2 // Cell boundary should absorb particles + + // Symmetry in the field boundary conditions refers to image charge + // sign + // + // Anti-symmetric -> Image charges are opposite signed (ideal metal) + // Boundary rho/j are accumulated over partial voxel+image + // Symmetric -> Image charges are same signed (symmetry plane or pmc) + // Boundary rho/j are accumulated over partial voxel+image + // Absorbing -> No image charges + // Boundary rho/j are accumulated over partial voxel only + // + // rho -> Anti-symmetric | rho -> Symmetric + // jf_tang -> Anti-symmetric | jf_tang -> Symmetric + // 
E_tang -> Anti-symmetric | E_tang -> Symmetric + // B_norm -> Anti-symmetric + DC | B_norm -> Symmetric (see note) + // B_tang -> Symmetric | B_tang -> Anti-symmetric + // E_norm -> Symmetric | E_norm -> Anti-symmetric (see note) + // div B -> Symmetric | div B -> Anti-symmetric + // + // Note: B_norm is tricky. For a symmetry plane, B_norm on the + // boundary must be zero as there are no magnetic charges (a + // non-zero B_norm would imply an infinitesimal layer of magnetic + // charge). However, if a symmetric boundary is interpreted as a + // perfect magnetic conductor, B_norm could be present due to + // magnetic conduction surface charges. Even though there are no + // bulk volumetric magnetic charges to induce a surface magnetic + // charge, I think that radiation/waveguide modes/etc could (the + // total surface magnetic charge in the simulation would be zero + // though). As a result, symmetric and pmc boundary conditions are + // treated separately. Symmetric and pmc boundaries are identical + // except the symmetric boundaries explicitly zero boundary + // B_norm. Note: anti-symmetric and pec boundary conditions would + // have the same issue if norm E was located directly on the + // boundary. However, it is not so this problem does not arise. + // + // Note: Absorbing boundary conditions make no effort to clean + // divergence errors on them. They assume that the ghost div b is + // zero and force the surface div e on them to be zero. This means + // ghost norm e can be set to any value on absorbing boundaries. }; -typedef struct grid { - - // System of units - float dt, cvac, eps0; - - // Time stepper. 
The simulation time is given by - // t = g->t0 + (double)g->dt*(double)g->step - int64_t step; // Current timestep - double t0; // Simulation time corresponding to step 0 - - // Phase 2 grid data structures - float x0, y0, z0; // Min corner local domain (must be coherent) - float x1, y1, z1; // Max corner local domain (must be coherent) - int nx, ny, nz; // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. - float dx, dy, dz, dV; // Cell dimensions and volume (CONVENIENCE ... - // USE x0,x1 WHEN DECIDING WHICH NODE TO USE!) - float rdx, rdy, rdz, r8V; // Inverse voxel dimensions and one over - // eight times the voxel volume (CONVENIENCE) - int sx, sy, sz, nv; // Voxel indexing x-, y-,z- strides and the - // number of local voxels (including ghosts, - // (nx+2)(ny+2)(nz+2)), (CONVENIENCE) - int bc[27]; // (-1:1,-1:1,-1:1) FORTRAN indexed array of - // boundary conditions to apply at domain edge - // 0 ... nproc-1 ... comm boundary condition - // <0 ... locally applied boundary condition - - // Phase 3 grid data structures - // NOTE: VOXEL INDEXING LIMITS NUMBER OF VOXELS TO 2^31 (INCLUDING - // GHOSTS) PER NODE. NEIGHBOR INDEXING FURTHER LIMITS TO - // (2^31)/6. BOUNDARY CONDITION HANDLING LIMITS TO 2^28 PER NODE - // EMITTER COMPONENT ID INDEXING FURTHER LIMITS TO 2^26 PER NODE. - // THE LIMIT IS 2^63 OVER ALL NODES THOUGH. - int64_t * ALIGNED(16) range; - // (0:nproc) indexed array giving range of - // global indexes of voxel owned by each - // processor. Replicated on each processor. - // (range[rank]:range[rank+1]-1) are global - // voxels owned by processor "rank". Note: - // range[rank+1]-range[rank] <~ 2^31 / 6 - - int64_t * ALIGNED(128) neighbor; - // (0:5,0:local_num_voxel-1) FORTRAN indexed - // array neighbor(0:5,lidx) are the global - // indexes of neighboring voxels of the - // voxel with local index "lidx". 
Negative - // if neighbor is a boundary condition. - - int64_t rangel, rangeh; // Redundant for move_p performance reasons: - // rangel = range[rank] - // rangeh = range[rank+1]-1. - // Note: rangeh-rangel <~ 2^26 - - // Nearest neighbor communications ports - mp_t * mp; +typedef struct grid +{ + + // System of units + float dt, cvac, eps0; + + // Time stepper. The simulation time is given by + // t = g->t0 + (double)g->dt*(double)g->step + int64_t step; // Current timestep + double t0; // Simulation time corresponding to step 0 + + // Phase 2 grid data structures + float x0, y0, z0; // Min corner local domain (must be coherent) + float x1, y1, z1; // Max corner local domain (must be coherent) + int nx, ny, nz; // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. + float dx, dy, dz, dV; // Cell dimensions and volume (CONVENIENCE ... + // USE x0,x1 WHEN DECIDING WHICH NODE TO USE!) + float rdx, rdy, rdz, r8V; // Inverse voxel dimensions and one over + // eight times the voxel volume (CONVENIENCE) + int sx, sy, sz, nv; // Voxel indexing x-, y-,z- strides and the + // number of local voxels (including ghosts, + // (nx+2)(ny+2)(nz+2)), (CONVENIENCE) + int bc[27]; // (-1:1,-1:1,-1:1) FORTRAN indexed array of + // boundary conditions to apply at domain edge + // 0 ... nproc-1 ... comm boundary condition + // <0 ... locally applied boundary condition + + // Phase 3 grid data structures + // NOTE: VOXEL INDEXING LIMITS NUMBER OF VOXELS TO 2^31 (INCLUDING + // GHOSTS) PER NODE. NEIGHBOR INDEXING FURTHER LIMITS TO + // (2^31)/6. BOUNDARY CONDITION HANDLING LIMITS TO 2^28 PER NODE + // EMITTER COMPONENT ID INDEXING FURTHER LIMITS TO 2^26 PER NODE. + // THE LIMIT IS 2^63 OVER ALL NODES THOUGH. + int64_t* ALIGNED( 16 ) range; + // (0:nproc) indexed array giving range of + // global indexes of voxel owned by each + // processor. Replicated on each processor. 
+ // (range[rank]:range[rank+1]-1) are global + // voxels owned by processor "rank". Note: + // range[rank+1]-range[rank] <~ 2^31 / 6 + + int64_t* ALIGNED( 128 ) neighbor; + // (0:5,0:local_num_voxel-1) FORTRAN indexed + // array neighbor(0:5,lidx) are the global + // indexes of neighboring voxels of the + // voxel with local index "lidx". Negative + // if neighbor is a boundary condition. + + int64_t rangel, rangeh; // Redundant for move_p performance reasons: + // rangel = range[rank] + // rangeh = range[rank+1]-1. + // Note: rangeh-rangel <~ 2^26 + + // Nearest neighbor communications ports + mp_t* mp; } grid_t; // Given a voxel mesh coordinates (on 0:nx+1,0:ny+1,0:nz+1) and // voxel mesh resolution (nx,ny,nz), return the index of that voxel. -#define VOXEL(x,y,z, nx,ny,nz) ((x) + ((nx)+2)*((y) + ((ny)+2)*(z))) +#define VOXEL( x, y, z, nx, ny, nz ) \ + ( ( x ) + ( ( nx ) + 2 ) * ( ( y ) + ( ( ny ) + 2 ) * ( z ) ) ) // Advance the voxel mesh index (v) and corresponding voxel mesh // coordinates (x,y,z) in a region with min- and max-corners of @@ -147,43 +151,43 @@ typedef struct grid { // inner loops.) // // This is written with seeming extraneously if tests in order to get -// the compiler to generate branceless conditional move and add +// the compiler to generate branceless conditional move and add // instructions (none of the branches below are actual branches in // assembly). 
-#define NEXT_VOXEL(v,x,y,z, xl,xh, yl,yh, zl,zh, nx,ny,nz) \ - (v)++; \ - (x)++; \ - if( (x)>(xh) ) (v) += (nx)-(xh)+(xl)+1; \ - if( (x)>(xh) ) (y)++; \ - if( (x)>(xh) ) (x) = (xl); \ - if( (y)>(yh) ) (v) += ((ny)-(yh)+(yl)+1)*((nx)+2); \ - if( (y)>(yh) ) (z)++; \ - if( (y)>(yh) ) (y) = (yl) +#define NEXT_VOXEL( v, x, y, z, xl, xh, yl, yh, zl, zh, nx, ny, nz ) \ + ( v )++; \ + ( x )++; \ + if ( ( x ) > ( xh ) ) \ + ( v ) += ( nx ) - ( xh ) + ( xl ) + 1; \ + if ( ( x ) > ( xh ) ) \ + ( y )++; \ + if ( ( x ) > ( xh ) ) \ + ( x ) = ( xl ); \ + if ( ( y ) > ( yh ) ) \ + ( v ) += ( ( ny ) - ( yh ) + ( yl ) + 1 ) * ( ( nx ) + 2 ); \ + if ( ( y ) > ( yh ) ) \ + ( z )++; \ + if ( ( y ) > ( yh ) ) \ + ( y ) = ( yl ) BEGIN_C_DECLS // In grid_structors.c -grid_t * -new_grid( void ); +grid_t* new_grid( void ); -void -delete_grid( grid_t * g ); +void delete_grid( grid_t* g ); // In ops.c -void -size_grid( grid_t * g, int lnx, int lny, int lnz ); +void size_grid( grid_t* g, int lnx, int lny, int lnz ); -void -join_grid( grid_t * g, int bound, int rank ); +void join_grid( grid_t* g, int bound, int rank ); -void -set_fbc( grid_t *g, int bound, int fbc ); +void set_fbc( grid_t* g, int bound, int fbc ); -void -set_pbc( grid_t *g, int bound, int pbc ); +void set_pbc( grid_t* g, int bound, int pbc ); // In partition.c @@ -218,27 +222,18 @@ set_pbc( grid_t *g, int bound, int pbc ); // global coordinates. Due to the vagaries of floating point, the // inverse process may not be exact. 
-void -partition_periodic_box( grid_t *g, - double gx0, double gy0, double gz0, - double gx1, double gy1, double gz1, - int gnx, int gny, int gnz, - int gpx, int gpy, int gpz ); - -void -partition_absorbing_box( grid_t *g, - double gx0, double gy0, double gz0, - double gx1, double gy1, double gz1, - int gnx, int gny, int gnz, - int gpx, int gpy, int gpz, - int pbc ); - -void -partition_metal_box( grid_t *g, - double gx0, double gy0, double gz0, - double gx1, double gy1, double gz1, - int gnx, int gny, int gnz, - int gpx, int gpy, int gpz ); +void partition_periodic_box( grid_t* g, double gx0, double gy0, double gz0, + double gx1, double gy1, double gz1, int gnx, + int gny, int gnz, int gpx, int gpy, int gpz ); + +void partition_absorbing_box( grid_t* g, double gx0, double gy0, double gz0, + double gx1, double gy1, double gz1, int gnx, + int gny, int gnz, int gpx, int gpy, int gpz, + int pbc ); + +void partition_metal_box( grid_t* g, double gx0, double gy0, double gz0, + double gx1, double gy1, double gz1, int gnx, int gny, + int gnz, int gpx, int gpy, int gpz ); // In grid_comm.c @@ -247,47 +242,43 @@ partition_metal_box( grid_t *g, // Start receiving a message from the node. // Only one message recv may be pending at a time on a given port. -void -begin_recv_port( int i, // x port coord ([-1,0,1]) - int j, // y port coord ([-1,0,1]) - int k, // z port coord ([-1,0,1]) - int size, // Expected size in bytes - const grid_t * g ); +void begin_recv_port( int i, // x port coord ([-1,0,1]) + int j, // y port coord ([-1,0,1]) + int k, // z port coord ([-1,0,1]) + int size, // Expected size in bytes + const grid_t* g ); // Returns pointer to the buffer that begin send will use for the next // send on the given port. The buffer is guaranteed to have enough // room for size bytes. This is only valid to call if no sends on // that port are pending. 
-void * ALIGNED(128) -size_send_port( int i, // x port coord ([-1,0,1]) - int j, // y port coord ([-1,0,1]) - int k, // z port coord ([-1,0,1]) - int size, // Needed send size in bytes - const grid_t * g ); +void* ALIGNED( 128 ) size_send_port( int i, // x port coord ([-1,0,1]) + int j, // y port coord ([-1,0,1]) + int k, // z port coord ([-1,0,1]) + int size, // Needed send size in bytes + const grid_t* g ); // Begin sending size bytes of the buffer out the given port. Only // one message send may be pending at a time on a given port. (FIXME: // WHAT HAPPENS IF SIZE_SEND_PORT size < begin_send_port // size??) -void -begin_send_port( int i, // x port coord ([-1,0,1]) - int j, // y port coord ([-1,0,1]) - int k, // z port coord ([-1,0,1]) - int size, // Number of bytes to send (in bytes) - const grid_t * g ); +void begin_send_port( int i, // x port coord ([-1,0,1]) + int j, // y port coord ([-1,0,1]) + int k, // z port coord ([-1,0,1]) + int size, // Number of bytes to send (in bytes) + const grid_t* g ); // Complete the pending recv on the given port. Only valid to call if // there is a pending recv. Returns pointer to a buffer containing // the received data. (FIXME: WHAT HAPPENS IF EXPECTED RECV SIZE // GIVEN IN BEGIN_RECV DOES NOT MATCH END_RECV??) -void * ALIGNED(128) -end_recv_port( int i, // x port coord ([-1,0,1]) - int j, // y port coord ([-1,0,1]) - int k, // z port coord ([-1,0,1]) - const grid_t * g ); +void* ALIGNED( 128 ) end_recv_port( int i, // x port coord ([-1,0,1]) + int j, // y port coord ([-1,0,1]) + int k, // z port coord ([-1,0,1]) + const grid_t* g ); // Complete the pending send on the given port. Only valid to call if // there is a pending send on the port. Note that this guarantees @@ -295,11 +286,10 @@ end_recv_port( int i, // x port coord ([-1,0,1]) // necessarily that the message has arrived at the destination of the // port. 
-void -end_send_port( int i, // x port coord ([-1,0,1]) - int j, // y port coord ([-1,0,1]) - int k, // z port coord ([-1,0,1]) - const grid_t * g ); +void end_send_port( int i, // x port coord ([-1,0,1]) + int j, // y port coord ([-1,0,1]) + int k, // z port coord ([-1,0,1]) + const grid_t* g ); // In distribute_voxels.c @@ -311,26 +301,33 @@ end_send_port( int i, // x port coord ([-1,0,1]) // ordering (e.g. inner loop increments x-index). // // jobs are indexed from 0 to n_job-1. jobs are _always_ have the -// number of voxels an integer multiple of the bundle size. If job +// number of voxels an integer multiple of the bundle size. If job // is set to n_job, this function will determine the parameters of // the final incomplete bundle. -#define DISTRIBUTE_VOXELS( x0,x1, y0,y1, z0,z1, b, p,P, x,y,z,nv ) do { \ - int _x0=(x0), _y0=(y0), _z0=(z0), _b=(b), _p=(p), _P=(P); \ - int _nx = (x1)-_x0+1, _ny = (y1)-_y0+1, _nv = _nx*_ny*((z1)-_z0+1); \ - double _t = (double)( _nv/_b ) / (double)_P; \ - int _x=_b*(int)( _t*(double)(_p ) + 0.5 ), _y, _z; \ - if( _p<_P ) _nv=_b*(int)( _t*(double)(_p+1) + 0.5 ); \ - _nv -= _x; /* x = (x-x0) + nx*((y-y0) + ny*(z-z0)) */ \ - _y = _nx ? (_x/_nx) : 0; /* y = (y-y0) + ny*(z-z0) */ \ - _z = _ny ? (_y/_ny) : 0; /* z = (z-z0) */ \ - _x -= _y*_nx; /* x = (x-x0) */ \ - _y -= _z*_ny; /* y = (y-y0) */ \ - (x) = _x+_x0; \ - (y) = _y+_y0; \ - (z) = _z+_z0; \ - (nv) = _nv; \ - } while(0) +#define DISTRIBUTE_VOXELS( x0, x1, y0, y1, z0, z1, b, p, P, x, y, z, nv ) \ + do \ + { \ + int _x0 = ( x0 ), _y0 = ( y0 ), _z0 = ( z0 ), _b = ( b ), _p = ( p ), \ + _P = ( P ); \ + int _nx = (x1)-_x0 + 1, _ny = (y1)-_y0 + 1, \ + _nv = _nx * _ny * ( (z1)-_z0 + 1 ); \ + double _t = (double)( _nv / _b ) / (double)_P; \ + int _x = _b * (int)( _t * (double)( _p ) + 0.5 ), _y, _z; \ + if ( _p < _P ) \ + _nv = _b * (int)( _t * (double)( _p + 1 ) + 0.5 ); \ + _nv -= _x; /* x = (x-x0) + nx*((y-y0) + ny*(z-z0)) */ \ + _y = \ + _nx ? 
( _x / _nx ) : 0; /* y = (y-y0) + ny*(z-z0) */ \ + _z = \ + _ny ? ( _y / _ny ) : 0; /* z = (z-z0) */ \ + _x -= _y * _nx; /* x = (x-x0) */ \ + _y -= _z * _ny; /* y = (y-y0) */ \ + ( x ) = _x + _x0; \ + ( y ) = _y + _y0; \ + ( z ) = _z + _z0; \ + ( nv ) = _nv; \ + } while ( 0 ) END_C_DECLS diff --git a/src/material/material.h b/src/material/material.h index a8258512..0482f599 100644 --- a/src/material/material.h +++ b/src/material/material.h @@ -3,53 +3,44 @@ #include "../util/util.h" -enum { - max_material = 32768 // Valid materials are numbered 0...32767 +enum +{ + max_material = 32768 // Valid materials are numbered 0...32767 }; typedef int16_t material_id; -typedef struct material { - char * name; // Name of the material - float epsx, epsy, epsz; // Relative permittivity along x,y,z axes - float mux, muy, muz; // Relative permeability along x,y,z axes - float sigmax, sigmay, sigmaz; // Electrical conductivity along x,y,z axes - float zetax, zetay, zetaz; // Magnetic conductivity along x,y,z axes - material_id id; // Unique identifier for material - struct material *next; // Next material in list +typedef struct material +{ + char* name; // Name of the material + float epsx, epsy, epsz; // Relative permittivity along x,y,z axes + float mux, muy, muz; // Relative permeability along x,y,z axes + float sigmax, sigmay, sigmaz; // Electrical conductivity along x,y,z axes + float zetax, zetay, zetaz; // Magnetic conductivity along x,y,z axes + material_id id; // Unique identifier for material + struct material* next; // Next material in list } material_t; - + BEGIN_C_DECLS // In material.c -int -num_material( const material_t * m_list ); +int num_material( const material_t* m_list ); -void -delete_material_list( material_t * m_list ); +void delete_material_list( material_t* m_list ); -material_t * -find_material_id( material_id id, - material_t * m_list ); +material_t* find_material_id( material_id id, material_t* m_list ); -material_t * -find_material_name( const char 
* name, - material_t * m_list ); +material_t* find_material_name( const char* name, material_t* m_list ); -material_t * -append_material( material_t * m, - material_t ** m_list ); +material_t* append_material( material_t* m, material_t** m_list ); -material_id -get_material_id( const material_t * m ); +material_id get_material_id( const material_t* m ); -material_t * -material( const char * name, - float epsx, float epsy, float epsz, - float mux, float muy, float muz, - float sigmax, float sigmay, float sigmaz, - float zetax, float zetay, float zetaz ); +material_t* material( const char* name, float epsx, float epsy, float epsz, + float mux, float muy, float muz, float sigmax, + float sigmay, float sigmaz, float zetax, float zetay, + float zetaz ); END_C_DECLS diff --git a/src/sf_interface/pipeline/sf_interface_pipeline.h b/src/sf_interface/pipeline/sf_interface_pipeline.h index 5b1a4d55..463a02c8 100644 --- a/src/sf_interface/pipeline/sf_interface_pipeline.h +++ b/src/sf_interface/pipeline/sf_interface_pipeline.h @@ -12,48 +12,43 @@ typedef struct load_interpolator_pipeline_args { - MEM_PTR( interpolator_t, 128 ) fi; - MEM_PTR( const field_t, 128 ) f; - MEM_PTR( const int64_t, 128 ) nb; - int nx; - int ny; - int nz; + MEM_PTR( interpolator_t, 128 ) fi; + MEM_PTR( const field_t, 128 ) f; + MEM_PTR( const int64_t, 128 ) nb; + int nx; + int ny; + int nz; - PAD_STRUCT( 3*SIZEOF_MEM_PTR + 3*sizeof(int) ) + PAD_STRUCT( 3 * SIZEOF_MEM_PTR + 3 * sizeof( int ) ) } load_interpolator_pipeline_args_t; -void -load_interpolator_pipeline_scalar( load_interpolator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void load_interpolator_pipeline_scalar( load_interpolator_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -load_interpolator_pipeline_v4( load_interpolator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void load_interpolator_pipeline_v4( load_interpolator_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); 
/////////////////////////////////////////////////////////////////////////////// typedef struct unload_accumulator_pipeline_args { - MEM_PTR( field_t, 128 ) f; // Reduce accumulators to this - MEM_PTR( const accumulator_t, 128 ) a; // Accumulator array to reduce - int nx; // Local domain x-resolution - int ny; // Local domain y-resolution - int nz; // Local domain z-resolution - float cx; // x-axis coupling constant - float cy; // y-axis coupling constant - float cz; // z-axis coupling constant + MEM_PTR( field_t, 128 ) f; // Reduce accumulators to this + MEM_PTR( const accumulator_t, 128 ) a; // Accumulator array to reduce + int nx; // Local domain x-resolution + int ny; // Local domain y-resolution + int nz; // Local domain z-resolution + float cx; // x-axis coupling constant + float cy; // y-axis coupling constant + float cz; // z-axis coupling constant - PAD_STRUCT( 2*SIZEOF_MEM_PTR + 3*sizeof(int) + 3*sizeof(float) ) + PAD_STRUCT( 2 * SIZEOF_MEM_PTR + 3 * sizeof( int ) + 3 * sizeof( float ) ) } unload_accumulator_pipeline_args_t; -void -unload_accumulator_pipeline_scalar( unload_accumulator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void unload_accumulator_pipeline_scalar( + unload_accumulator_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // clear_array_pipeline interface @@ -62,46 +57,40 @@ unload_accumulator_pipeline_scalar( unload_accumulator_pipeline_args_t * args, // (16KB) which is particularly convenient on Cell. The pipeline // dispatcher will handle any stragglers. 
-enum { accumulators_n_block = 256, hydro_n_block = 1 }; +enum +{ + accumulators_n_block = 256, + hydro_n_block = 1 +}; typedef struct reduce_pipeline_args { - MEM_PTR(float, 128) a; // First array element to reduce - int n; // Number of array elements to reduce - int n_array; // Number of pipeline arrays - int s_array; // Stride between each array - int n_block; // Number of floats/block. + MEM_PTR( float, 128 ) a; // First array element to reduce + int n; // Number of array elements to reduce + int n_array; // Number of pipeline arrays + int s_array; // Stride between each array + int n_block; // Number of floats/block. - PAD_STRUCT( SIZEOF_MEM_PTR + 4*sizeof(int) ) + PAD_STRUCT( SIZEOF_MEM_PTR + 4 * sizeof( int ) ) } reduce_pipeline_args_t; -void -clear_array_pipeline_scalar( reduce_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void clear_array_pipeline_scalar( reduce_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // reduce_array_pipeline interface -void -reduce_array_pipeline_scalar( reduce_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -reduce_array_pipeline_v4( reduce_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -reduce_array_pipeline_v8( reduce_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -reduce_array_pipeline_v16( reduce_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void reduce_array_pipeline_scalar( reduce_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); + +void reduce_array_pipeline_v4( reduce_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); + +void reduce_array_pipeline_v8( reduce_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); + +void reduce_array_pipeline_v16( reduce_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _sf_interface_pipeline_h_ diff --git 
a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index 7202dc78..816212fb 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -24,23 +24,21 @@ //----------------------------------------------------------------------------// // 64-byte align -#if defined(USE_V16_PORTABLE) || \ - defined(USE_V16_AVX512) +#if defined( USE_V16_PORTABLE ) || defined( USE_V16_AVX512 ) #define PAD_SIZE_INTERPOLATOR 14 -#define PAD_SIZE_ACCUMULATOR 4 -#define PAD_SIZE_HYDRO 2 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 //----------------------------------------------------------------------------// // 32-byte align -#elif defined(USE_V8_PORTABLE) || \ - defined(USE_V8_AVX) || \ - defined(USE_V8_AVX2) +#elif defined( USE_V8_PORTABLE ) || defined( USE_V8_AVX ) || \ + defined( USE_V8_AVX2 ) #define PAD_SIZE_INTERPOLATOR 6 -#define PAD_SIZE_ACCUMULATOR 4 -#define PAD_SIZE_HYDRO 2 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 //----------------------------------------------------------------------------// // 16-byte align @@ -48,7 +46,7 @@ #else #define PAD_SIZE_INTERPOLATOR 2 -#define PAD_SIZE_HYDRO 2 +#define PAD_SIZE_HYDRO 2 #endif @@ -61,33 +59,31 @@ typedef struct interpolator { - float ex, dexdy, dexdz, d2exdydz; - float ey, deydz, deydx, d2eydzdx; - float ez, dezdx, dezdy, d2ezdxdy; - float cbx, dcbxdx; - float cby, dcbydy; - float cbz, dcbzdz; - float _pad1[PAD_SIZE_INTERPOLATOR]; - // float _pad1[2]; // 16-byte align - // float _pad2[4]; // More padding to get 32-byte align, make conditional - // float _pad3[8]; // More padding to get 64-byte align, make conditional + float ex, dexdy, dexdz, d2exdydz; + float ey, deydz, deydx, d2eydzdx; + float ez, dezdx, dezdy, d2ezdxdy; + float cbx, dcbxdx; + float cby, dcbydy; + float cbz, dcbzdz; + float _pad1[PAD_SIZE_INTERPOLATOR]; + // float _pad1[2]; // 16-byte align + // float _pad2[4]; // More padding to get 32-byte align, make conditional + // float _pad3[8]; // 
More padding to get 64-byte align, make conditional } interpolator_t; typedef struct interpolator_array { - interpolator_t * ALIGNED(128) i; - grid_t * g; + interpolator_t* ALIGNED( 128 ) i; + grid_t* g; } interpolator_array_t; BEGIN_C_DECLS // In interpolator_array.cc -interpolator_array_t * -new_interpolator_array( grid_t * g ); +interpolator_array_t* new_interpolator_array( grid_t* g ); -void -delete_interpolator_array( interpolator_array_t * ALIGNED(128) ia ); +void delete_interpolator_array( interpolator_array_t* ALIGNED( 128 ) ia ); // Going into load_interpolator, the field array f contains the // current information such that the fields can be interpolated to @@ -96,9 +92,8 @@ delete_interpolator_array( interpolator_array_t * ALIGNED(128) ia ); // inside the local domain suitable for use by the particle update // functions. -void -load_interpolator_array( /**/ interpolator_array_t * RESTRICT ia, - const field_array_t * RESTRICT fa ); +void load_interpolator_array( /**/ interpolator_array_t* RESTRICT ia, + const field_array_t* RESTRICT fa ); END_C_DECLS @@ -114,38 +109,35 @@ END_C_DECLS typedef struct accumulator { - float jx[4]; // jx0@(0,-1,-1),jx1@(0,1,-1),jx2@(0,-1,1),jx3@(0,1,1) - float jy[4]; // jy0@(-1,0,-1),jy1@(-1,0,1),jy2@(1,0,-1),jy3@(1,0,1) - float jz[4]; // jz0@(-1,-1,0),jz1@(1,-1,0),jz2@(-1,1,0),jz3@(1,1,0) - #if defined PAD_SIZE_ACCUMULATOR - float pad2[PAD_SIZE_ACCUMULATOR]; // Padding for 32 and 64-byte align - #endif + float jx[4]; // jx0@(0,-1,-1),jx1@(0,1,-1),jx2@(0,-1,1),jx3@(0,1,1) + float jy[4]; // jy0@(-1,0,-1),jy1@(-1,0,1),jy2@(1,0,-1),jy3@(1,0,1) + float jz[4]; // jz0@(-1,-1,0),jz1@(1,-1,0),jz2@(-1,1,0),jz3@(1,1,0) +#if defined PAD_SIZE_ACCUMULATOR + float pad2[PAD_SIZE_ACCUMULATOR]; // Padding for 32 and 64-byte align +#endif } accumulator_t; typedef struct accumulator_array { - accumulator_t * ALIGNED(128) a; - int n_pipeline; // Number of pipelines supported by this accumulator - int stride; // Stride be each pipeline's 
accumulator array - grid_t * g; + accumulator_t* ALIGNED( 128 ) a; + int n_pipeline; // Number of pipelines supported by this accumulator + int stride; // Stride be each pipeline's accumulator array + grid_t* g; } accumulator_array_t; BEGIN_C_DECLS // In accumulator_array.cc -accumulator_array_t * -new_accumulator_array( grid_t * g ); +accumulator_array_t* new_accumulator_array( grid_t* g ); -void -delete_accumulator_array( accumulator_array_t * a ); +void delete_accumulator_array( accumulator_array_t* a ); // In clear_array.cc // This zeros out all the accumulator arrays in a pipelined fashion. -void -clear_accumulator_array( accumulator_array_t * RESTRICT a ); +void clear_accumulator_array( accumulator_array_t* RESTRICT a ); // In reduce_array.cc @@ -155,8 +147,7 @@ clear_accumulator_array( accumulator_array_t * RESTRICT a ); // accumulator with a pipelined horizontal reduction (a deterministic // reduction). -void -reduce_accumulator_array( accumulator_array_t * RESTRICT a ); +void reduce_accumulator_array( accumulator_array_t* RESTRICT a ); // In unload_accumulator.cc @@ -169,9 +160,8 @@ reduce_accumulator_array( accumulator_array_t * RESTRICT a ); // local field array jf. unload_accumulator assumes all the pipeline // accumulators have been reduced into the host accumulator. -void -unload_accumulator_array( /**/ field_array_t * RESTRICT fa, - const accumulator_array_t * RESTRICT aa ); +void unload_accumulator_array( /**/ field_array_t* RESTRICT fa, + const accumulator_array_t* RESTRICT aa ); END_C_DECLS @@ -184,19 +174,20 @@ END_C_DECLS typedef struct hydro { - float jx, jy, jz, rho; // Current and charge density => , - float px, py, pz, ke; // Momentum and K.E. density => , - float txx, tyy, tzz; // Stress diagonal => , i==j - float tyz, tzx, txy; // Stress off-diagonal => , i!=j - float _pad[PAD_SIZE_HYDRO]; // 16, 32 and 64-byte align + float jx, jy, jz, rho; // Current and charge density => , + float px, py, pz, + ke; // Momentum and K.E. 
density => , + float txx, tyy, tzz; // Stress diagonal => , i==j + float tyz, tzx, txy; // Stress off-diagonal => , i!=j + float _pad[PAD_SIZE_HYDRO]; // 16, 32 and 64-byte align } hydro_t; typedef struct hydro_array { - hydro_t * ALIGNED(128) h; - int n_pipeline; // Number of pipelines supported by this hydro - int stride; // Stride be each pipeline's hydro array - grid_t * g; + hydro_t* ALIGNED( 128 ) h; + int n_pipeline; // Number of pipelines supported by this hydro + int stride; // Stride be each pipeline's hydro array + grid_t* g; } hydro_array_t; BEGIN_C_DECLS @@ -205,21 +196,18 @@ BEGIN_C_DECLS // Construct a hydro array suitable for the grid -hydro_array_t * -new_hydro_array( grid_t * g ); +hydro_array_t* new_hydro_array( grid_t* g ); // Destruct a hydro array -void -delete_hydro_array( hydro_array_t * ha ); +void delete_hydro_array( hydro_array_t* ha ); // In clear_array.cc // Zero out the hydro array. Use before accumulating species to // a hydro array. -void -clear_hydro_array( hydro_array_t * ha ); +void clear_hydro_array( hydro_array_t* ha ); // In reduce_array.cc @@ -227,8 +215,7 @@ clear_hydro_array( hydro_array_t * ha ); // synchronize_hydro_array and does not typically need to be otherwise // called. -void -reduce_hydro_array( hydro_array_t * ha ); +void reduce_hydro_array( hydro_array_t* ha ); // In hydro_array.cc @@ -236,8 +223,7 @@ reduce_hydro_array( hydro_array_t * ha ); // the hydro array with local boundary conditions and neighboring // processes. Use after all species have been accumulated to the hydro array. 
-void -synchronize_hydro_array( hydro_array_t * ha ); +void synchronize_hydro_array( hydro_array_t* ha ); END_C_DECLS diff --git a/src/sf_interface/sf_interface_private.h b/src/sf_interface/sf_interface_private.h index c6c3dae0..93cab78f 100644 --- a/src/sf_interface/sf_interface_private.h +++ b/src/sf_interface/sf_interface_private.h @@ -10,32 +10,26 @@ /////////////////////////////////////////////////////////////////////////////// // load_interpolator_pipeline interface -void -load_interpolator_array_pipeline( interpolator_array_t * RESTRICT ia, - const field_array_t * RESTRICT fa ); +void load_interpolator_array_pipeline( interpolator_array_t* RESTRICT ia, + const field_array_t* RESTRICT fa ); /////////////////////////////////////////////////////////////////////////////// // clear_accumulators_pipeline interface -void -clear_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ); +void clear_accumulator_array_pipeline( accumulator_array_t* RESTRICT aa ); -void -reduce_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ); +void reduce_accumulator_array_pipeline( accumulator_array_t* RESTRICT aa ); /////////////////////////////////////////////////////////////////////////////// // clear_hydro_pipeline interface -void -clear_hydro_array_pipeline( hydro_array_t * RESTRICT ha ); +void clear_hydro_array_pipeline( hydro_array_t* RESTRICT ha ); -void -reduce_hydro_array_pipeline( hydro_array_t * RESTRICT ha ); +void reduce_hydro_array_pipeline( hydro_array_t* RESTRICT ha ); /////////////////////////////////////////////////////////////////////////////// -void -unload_accumulator_array_pipeline( field_array_t * RESTRICT fa, - const accumulator_array_t * RESTRICT aa ); +void unload_accumulator_array_pipeline( + field_array_t* RESTRICT fa, const accumulator_array_t* RESTRICT aa ); #endif // _sf_interface_private_h_ diff --git a/src/species_advance/species_advance.h b/src/species_advance/species_advance.h index 20aa54b7..3fb0d671 100644 --- 
a/src/species_advance/species_advance.h +++ b/src/species_advance/species_advance.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -28,33 +28,19 @@ BEGIN_C_DECLS // In species_advance.cc -int -num_species( const species_t * sp_list ); +int num_species( const species_t* sp_list ); -void -delete_species_list( species_t * sp_list ); +void delete_species_list( species_t* sp_list ); -species_t * -find_species_id( species_id id, - species_t * sp_list ); +species_t* find_species_id( species_id id, species_t* sp_list ); -species_t * -find_species_name( const char * name, - species_t * sp_list ); +species_t* find_species_name( const char* name, species_t* sp_list ); -species_t * -append_species( species_t * sp, - species_t ** sp_list ); +species_t* append_species( species_t* sp, species_t** sp_list ); -species_t * -species( const char * name, - float q, - float m, - size_t max_local_np, - size_t max_local_nm, - int sort_interval, - int sort_out_of_place, - grid_t * g ); +species_t* species( const char* name, float q, float m, size_t max_local_np, + size_t max_local_nm, int sort_interval, + int sort_out_of_place, grid_t* g ); // FIXME: TEMPORARY HACK UNTIL THIS SPECIES_ADVANCE KERNELS // CAN BE CONSTRUCTED ANALOGOUS TO THE FIELD_ADVANCE KERNELS @@ -62,23 +48,18 @@ species( const char * name, // In sort_p.cc -void -sort_p( species_t * RESTRICT sp ); +void sort_p( species_t* RESTRICT sp ); -void -sort_p_pipeline( species_t * sp ); +void sort_p_pipeline( species_t* sp ); // In advance_p.cc -void -advance_p( species_t * RESTRICT sp, - accumulator_array_t * RESTRICT aa, - const interpolator_array_t * RESTRICT ia ); +void advance_p( species_t* RESTRICT sp, accumulator_array_t* RESTRICT aa, + const interpolator_array_t* RESTRICT ia ); -void -advance_p_pipeline( species_t * RESTRICT sp, - accumulator_array_t * RESTRICT aa, - const interpolator_array_t * RESTRICT ia ); +void advance_p_pipeline( species_t* RESTRICT sp, + 
accumulator_array_t* RESTRICT aa, + const interpolator_array_t* RESTRICT ia ); // In center_p.cc @@ -87,13 +68,11 @@ advance_p_pipeline( species_t * RESTRICT sp, // half a step stale is moved second order accurate to have r and u on // the time step. -void -center_p( species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void center_p( species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); -void -center_p_pipeline( species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void center_p_pipeline( species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); // In uncenter_p.cc @@ -101,13 +80,11 @@ center_p_pipeline( species_t * RESTRICT sp, // the time step are adjusted to have r at the time step and u half a // step stale. -void -uncenter_p( species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void uncenter_p( species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); -void -uncenter_p_pipeline( species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void uncenter_p_pipeline( species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); // In energy.cc @@ -115,46 +92,38 @@ uncenter_p_pipeline( species_t * RESTRICT sp, // calculation is done numerically robustly. All nodes get the same // result. 
-double -energy_p( const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +double energy_p( const species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); -double -energy_p_pipeline( const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +double energy_p_pipeline( const species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); // In rho_p.cc -void -accumulate_rho_p( field_array_t * RESTRICT fa, - const species_t * RESTRICT sp ); +void accumulate_rho_p( field_array_t* RESTRICT fa, + const species_t* RESTRICT sp ); -void -accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, - const particle_t * RESTRICT ALIGNED(32) p, - const grid_t * RESTRICT g, - const float qsp ); +void accumulate_rhob( field_t* RESTRICT ALIGNED( 128 ) f, + const particle_t* RESTRICT ALIGNED( 32 ) p, + const grid_t* RESTRICT g, const float qsp ); // In hydro_p.cc -void -accumulate_hydro_p( hydro_array_t * RESTRICT ha, - const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void accumulate_hydro_p( hydro_array_t* RESTRICT ha, + const species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); -void -accumulate_hydro_p_pipeline( hydro_array_t * RESTRICT ha, - const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ); +void accumulate_hydro_p_pipeline( hydro_array_t* RESTRICT ha, + const species_t* RESTRICT sp, + const interpolator_array_t* RESTRICT ia ); // In move_p.cc -int -move_p( particle_t * ALIGNED(128) p0, // Particle array - particle_mover_t * ALIGNED(16) m, // Particle mover to apply - accumulator_t * ALIGNED(128) a0, // Accumulator to use - const grid_t * g, // Grid parameters - const float qsp ); // Species particle charge +int move_p( particle_t* ALIGNED( 128 ) p0, // Particle array + particle_mover_t* ALIGNED( 16 ) m, // Particle mover to apply + accumulator_t* ALIGNED( 128 ) a0, // Accumulator to use + const grid_t* g, // Grid parameters + const float qsp ); // Species 
particle charge END_C_DECLS diff --git a/src/species_advance/species_advance_aos.h b/src/species_advance/species_advance_aos.h index 3e1af9ad..60e60577 100644 --- a/src/species_advance/species_advance_aos.h +++ b/src/species_advance/species_advance_aos.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -18,79 +18,83 @@ typedef int32_t species_id; // Must be 32-bit wide for particle_injector_t // (maybe) should be opaque and specific to a particular // species_advance implementation -typedef struct particle { - float dx, dy, dz; // Particle position in cell coordinates (on [-1,1]) - int32_t i; // Voxel containing the particle. Note that - /**/ // particles awaiting processing by boundary_p - /**/ // have actually set this to 8*voxel + face where - /**/ // face is the index of the face they interacted - /**/ // with (on 0:5). This limits the local number of - /**/ // voxels to 2^28 but emitter handling already - /**/ // has a stricter limit on this (2^26). - float ux, uy, uz; // Particle normalized momentum - float w; // Particle weight (number of physical particles) +typedef struct particle +{ + float dx, dy, dz; // Particle position in cell coordinates (on [-1,1]) + int32_t i; // Voxel containing the particle. Note that + /**/ // particles awaiting processing by boundary_p + /**/ // have actually set this to 8*voxel + face where + /**/ // face is the index of the face they interacted + /**/ // with (on 0:5). This limits the local number of + /**/ // voxels to 2^28 but emitter handling already + /**/ // has a stricter limit on this (2^26). 
+ float ux, uy, uz; // Particle normalized momentum + float w; // Particle weight (number of physical particles) } particle_t; // WARNING: FUNCTIONS THAT USE A PARTICLE_MOVER ASSUME THAT EVERYBODY // WHO USES THAT PARTICLE MOVER WILL HAVE ACCESS TO PARTICLE ARRAY -typedef struct particle_mover { - float dispx, dispy, dispz; // Displacement of particle - int32_t i; // Index of the particle to move +typedef struct particle_mover +{ + float dispx, dispy, dispz; // Displacement of particle + int32_t i; // Index of the particle to move } particle_mover_t; // NOTE: THE LAYOUT OF A PARTICLE_INJECTOR _MUST_ BE COMPATIBLE WITH // THE CONCATENATION OF A PARTICLE_T AND A PARTICLE_MOVER! -typedef struct particle_injector { - float dx, dy, dz; // Particle position in cell coords (on [-1,1]) - int32_t i; // Index of cell containing the particle - float ux, uy, uz; // Particle normalized momentum - float w; // Particle weight (number of physical particles) - float dispx, dispy, dispz; // Displacement of particle - species_id sp_id; // Species of particle +typedef struct particle_injector +{ + float dx, dy, dz; // Particle position in cell coords (on [-1,1]) + int32_t i; // Index of cell containing the particle + float ux, uy, uz; // Particle normalized momentum + float w; // Particle weight (number of physical particles) + float dispx, dispy, dispz; // Displacement of particle + species_id sp_id; // Species of particle } particle_injector_t; -typedef struct species { - char * name; // Species name - float q; // Species particle charge - float m; // Species particle rest mass +typedef struct species +{ + char* name; // Species name + float q; // Species particle charge + float m; // Species particle rest mass - int np, max_np; // Number and max local particles - particle_t * ALIGNED(128) p; // Array of particles for the species + int np, max_np; // Number and max local particles + particle_t* ALIGNED( 128 ) p; // Array of particles for the species - int nm, max_nm; // Number and 
max local movers in use - particle_mover_t * ALIGNED(128) pm; // Particle movers + int nm, max_nm; // Number and max local movers in use + particle_mover_t* ALIGNED( 128 ) pm; // Particle movers - int64_t last_sorted; // Step when the particles were last - // sorted. - int sort_interval; // How often to sort the species - int sort_out_of_place; // Sort method - int * ALIGNED(128) partition; // Static array indexed 0: - /**/ // (nx+2)*(ny+2)*(nz+2). Each value - /**/ // corresponds to the associated particle - /**/ // array index of the first particle in - /**/ // the cell. Array is allocated and - /**/ // values computed in sort_p. Purpose is - /**/ // for implementing collision models - /**/ // This is given in terms of the - /**/ // underlying's grids space filling - /**/ // curve indexing. Thus, immediately - /**/ // after a sort: - /**/ // sp->p[sp->partition[g->sfc[i] ]: - /**/ // sp->partition[g->sfc[i]+1]-1] - /**/ // are all the particles in voxel - /**/ // with local index i, while: - /**/ // sp->p[ sp->partition[ j ]: - /**/ // sp->partition[ j+1 ] ] - /**/ // are all the particles in voxel - /**/ // with space filling curve index j. - /**/ // Note: SFC NOT IN USE RIGHT NOW THUS - /**/ // g->sfc[i]=i ABOVE. + int64_t last_sorted; // Step when the particles were last + // sorted. + int sort_interval; // How often to sort the species + int sort_out_of_place; // Sort method + int* ALIGNED( 128 ) partition; // Static array indexed 0: + /**/ // (nx+2)*(ny+2)*(nz+2). Each value + /**/ // corresponds to the associated particle + /**/ // array index of the first particle in + /**/ // the cell. Array is allocated and + /**/ // values computed in sort_p. Purpose is + /**/ // for implementing collision models + /**/ // This is given in terms of the + /**/ // underlying's grids space filling + /**/ // curve indexing. 
Thus, immediately + /**/ // after a sort: + /**/ // sp->p[sp->partition[g->sfc[i] ]: + /**/ // sp->partition[g->sfc[i]+1]-1] + /**/ // are all the particles in voxel + /**/ // with local index i, while: + /**/ // sp->p[ sp->partition[ j ]: + /**/ // sp->partition[ j+1 ] ] + /**/ // are all the particles in voxel + /**/ // with space filling curve index j. + /**/ // Note: SFC NOT IN USE RIGHT NOW THUS + /**/ // g->sfc[i]=i ABOVE. - grid_t * g; // Underlying grid - species_id id; // Unique identifier for a species - struct species *next; // Next species in the list + grid_t* g; // Underlying grid + species_id id; // Unique identifier for a species + struct species* next; // Next species in the list } species_t; #endif // _species_advance_aos_h_ diff --git a/src/species_advance/standard/pipeline/spa_private.h b/src/species_advance/standard/pipeline/spa_private.h index f5c727ca..84206249 100644 --- a/src/species_advance/standard/pipeline/spa_private.h +++ b/src/species_advance/standard/pipeline/spa_private.h @@ -12,181 +12,142 @@ typedef struct particle_mover_seg { - MEM_PTR( particle_mover_t, 16 ) pm; // First mover in segment - int max_nm; // Maximum number of movers - int nm; // Number of movers used - int n_ignored; // Number of movers ignored + MEM_PTR( particle_mover_t, 16 ) pm; // First mover in segment + int max_nm; // Maximum number of movers + int nm; // Number of movers used + int n_ignored; // Number of movers ignored - PAD_STRUCT( SIZEOF_MEM_PTR+3*sizeof(int) ) + PAD_STRUCT( SIZEOF_MEM_PTR + 3 * sizeof( int ) ) } particle_mover_seg_t; typedef struct advance_p_pipeline_args { - MEM_PTR( particle_t, 128 ) p0; // Particle array - MEM_PTR( particle_mover_t, 128 ) pm; // Particle mover array - MEM_PTR( accumulator_t, 128 ) a0; // Accumulator arrays - MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array - MEM_PTR( particle_mover_seg_t, 128 ) seg; // Dest for return values - MEM_PTR( const grid_t, 1 ) g; // Local domain grid params - - float qdt_2mc; // 
Particle/field coupling - float cdt_dx; // x-space/time coupling - float cdt_dy; // y-space/time coupling - float cdt_dz; // z-space/time coupling - float qsp; // Species particle charge - - int np; // Number of particles - int max_nm; // Number of movers - int nx; // x-mesh resolution - int ny; // y-mesh resolution - int nz; // z-mesh resolution - - PAD_STRUCT( 6*SIZEOF_MEM_PTR + 5*sizeof(float) + 5*sizeof(int) ) + MEM_PTR( particle_t, 128 ) p0; // Particle array + MEM_PTR( particle_mover_t, 128 ) pm; // Particle mover array + MEM_PTR( accumulator_t, 128 ) a0; // Accumulator arrays + MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array + MEM_PTR( particle_mover_seg_t, 128 ) seg; // Dest for return values + MEM_PTR( const grid_t, 1 ) g; // Local domain grid params + + float qdt_2mc; // Particle/field coupling + float cdt_dx; // x-space/time coupling + float cdt_dy; // y-space/time coupling + float cdt_dz; // z-space/time coupling + float qsp; // Species particle charge + + int np; // Number of particles + int max_nm; // Number of movers + int nx; // x-mesh resolution + int ny; // y-mesh resolution + int nz; // z-mesh resolution + + PAD_STRUCT( 6 * SIZEOF_MEM_PTR + 5 * sizeof( float ) + 5 * sizeof( int ) ) } advance_p_pipeline_args_t; -void -advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_p_pipeline_scalar( advance_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -advance_p_pipeline_v4( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_p_pipeline_v4( advance_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_p_pipeline_v8( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void advance_p_pipeline_v8( advance_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -advance_p_pipeline_v16( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); 
+void advance_p_pipeline_v16( advance_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // center_p_pipeline and uncenter_p_pipeline interface typedef struct center_p_pipeline_args { - MEM_PTR( particle_t, 128 ) p0; // Particle array - MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array - float qdt_2mc; // Particle/field coupling - int np; // Number of particles + MEM_PTR( particle_t, 128 ) p0; // Particle array + MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array + float qdt_2mc; // Particle/field coupling + int np; // Number of particles - PAD_STRUCT( 2*SIZEOF_MEM_PTR + sizeof(float) + sizeof(int) ) + PAD_STRUCT( 2 * SIZEOF_MEM_PTR + sizeof( float ) + sizeof( int ) ) } center_p_pipeline_args_t; -void -center_p_pipeline_scalar( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -center_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -center_p_pipeline_v8( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); -void -center_p_pipeline_v16( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); - -void -uncenter_p_pipeline_scalar( center_p_pipeline_args_t * args, - int pipeline_rank, +void center_p_pipeline_scalar( center_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); + +void center_p_pipeline_v4( center_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); + +void center_p_pipeline_v8( center_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); +void center_p_pipeline_v16( center_p_pipeline_args_t* args, int pipeline_rank, int n_pipeline ); -void -uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void uncenter_p_pipeline_scalar( center_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); + +void uncenter_p_pipeline_v4( 
center_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -uncenter_p_pipeline_v8( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void uncenter_p_pipeline_v8( center_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -uncenter_p_pipeline_v16( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void uncenter_p_pipeline_v16( center_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // energy_p_pipeline interface typedef struct energy_p_pipeline_args { - MEM_PTR( const particle_t, 128 ) p; // Particle array - MEM_PTR( const interpolator_t, 128 ) f; // Interpolator array - MEM_PTR( double, 128 ) en; // Return values - float qdt_2mc; // Particle/field coupling - float msp; // Species particle rest mass - int np; // Number of particles - - PAD_STRUCT( 3*SIZEOF_MEM_PTR + 2*sizeof(float) + sizeof(int) ) + MEM_PTR( const particle_t, 128 ) p; // Particle array + MEM_PTR( const interpolator_t, 128 ) f; // Interpolator array + MEM_PTR( double, 128 ) en; // Return values + float qdt_2mc; // Particle/field coupling + float msp; // Species particle rest mass + int np; // Number of particles + + PAD_STRUCT( 3 * SIZEOF_MEM_PTR + 2 * sizeof( float ) + sizeof( int ) ) } energy_p_pipeline_args_t; -void -energy_p_pipeline_scalar( energy_p_pipeline_args_t * RESTRICT args, - int pipeline_rank, - int n_pipeline ); +void energy_p_pipeline_scalar( energy_p_pipeline_args_t* RESTRICT args, + int pipeline_rank, int n_pipeline ); -void -energy_p_pipeline_v4( energy_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void energy_p_pipeline_v4( energy_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -energy_p_pipeline_v8( energy_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void energy_p_pipeline_v8( energy_p_pipeline_args_t* args, int pipeline_rank, + int 
n_pipeline ); -void -energy_p_pipeline_v16( energy_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void energy_p_pipeline_v16( energy_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // accumulate_hydro_p_pipeline interface typedef struct accumulate_hydro_p_pipeline_args { - MEM_PTR( const species_t, 128 ) sp; // Species array - MEM_PTR( const interpolator_t, 128 ) f; // Interpolator array - MEM_PTR( /**/ hydro_t, 128 ) h; // Hydro values - int h_size; // Size of each hydro array - float qdt_2mc; // Particle/field coupling - float msp; // Species particle rest mass - int np; // Number of particles - - PAD_STRUCT( 3*SIZEOF_MEM_PTR + 2*sizeof(float) + 2*sizeof(int) ) + MEM_PTR( const species_t, 128 ) sp; // Species array + MEM_PTR( const interpolator_t, 128 ) f; // Interpolator array + MEM_PTR( /**/ hydro_t, 128 ) h; // Hydro values + int h_size; // Size of each hydro array + float qdt_2mc; // Particle/field coupling + float msp; // Species particle rest mass + int np; // Number of particles + + PAD_STRUCT( 3 * SIZEOF_MEM_PTR + 2 * sizeof( float ) + 2 * sizeof( int ) ) } accumulate_hydro_p_pipeline_args_t; -void -accumulate_hydro_p_pipeline_scalar( accumulate_hydro_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void accumulate_hydro_p_pipeline_scalar( + accumulate_hydro_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); -void -accumulate_hydro_p_pipeline_v4( accumulate_hydro_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void accumulate_hydro_p_pipeline_v4( accumulate_hydro_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -accumulate_hydro_p_pipeline_v8( accumulate_hydro_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void accumulate_hydro_p_pipeline_v8( accumulate_hydro_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void 
-accumulate_hydro_p_pipeline_v16( accumulate_hydro_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void accumulate_hydro_p_pipeline_v16( accumulate_hydro_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // sort_p_pipeline interface @@ -196,8 +157,9 @@ accumulate_hydro_p_pipeline_v16( accumulate_hydro_p_pipeline_args_t * args, // that v*P might overflow 32-bits and that only voxels [vl,vh] // may contain particles. This macro is mostly robust. -#define V2P( v, P, vl, vh ) ( (((int64_t)((v )-(vl) ))*((int64_t)(P))) / \ - ((int64_t)((vh)-(vl)+1)) ) +#define V2P( v, P, vl, vh ) \ + ( ( ( ( int64_t )( ( v ) - ( vl ) ) ) * ( ( int64_t )( P ) ) ) / \ + ( ( int64_t )( ( vh ) - ( vl ) + 1 ) ) ) // Given the pipeline rank, compute the first voxel a subsort is // responsible for handling. This is based on: @@ -211,41 +173,37 @@ accumulate_hydro_p_pipeline_v16( accumulate_hydro_p_pipeline_args_t * args, // where v above is v-vl and V = vh-vl+1. This takes into account // that p*V might overflow 32-bits. This macro is mostly robust. -#define P2V( p, P, vl, vh ) \ - ((vl)+((((int64_t)(p))*((int64_t)((vh)-(vl)+1)) + ((int64_t)((P)-1))) / \ - ((int64_t)(P)))) +#define P2V( p, P, vl, vh ) \ + ( ( vl ) + \ + ( ( ( ( int64_t )( p ) ) * ( ( int64_t )( ( vh ) - ( vl ) + 1 ) ) + \ + ( ( int64_t )( (P)-1 ) ) ) / \ + ( ( int64_t )( P ) ) ) ) // FIXME: safe to remove? 
enum { max_subsort_voxel = 26624 }; typedef struct sort_p_pipeline_args { - MEM_PTR( particle_t, 128 ) p; // Particles (0:n-1) - MEM_PTR( particle_t, 128 ) aux_p; // Aux particle atorage (0:n-1) - MEM_PTR( int, 128 ) coarse_partition; // Coarse partition storage - /**/ // (0:max_subsort-1,0:MAX_PIPELINE-1) - MEM_PTR( int, 128 ) partition; // Partitioning (0:n_voxel) - MEM_PTR( int, 128 ) next; // Aux partitioning (0:n_voxel) - int n; // Number of particles - int n_subsort; // Number of pipelines to be used for subsorts - int vl, vh; // Particles may be contained in voxels [vl,vh]. - int n_voxel; // Number of voxels total (including ghosts) - - PAD_STRUCT( 5*SIZEOF_MEM_PTR + 5*sizeof(int) ) + MEM_PTR( particle_t, 128 ) p; // Particles (0:n-1) + MEM_PTR( particle_t, 128 ) aux_p; // Aux particle atorage (0:n-1) + MEM_PTR( int, 128 ) coarse_partition; // Coarse partition storage + /**/ // (0:max_subsort-1,0:MAX_PIPELINE-1) + MEM_PTR( int, 128 ) partition; // Partitioning (0:n_voxel) + MEM_PTR( int, 128 ) next; // Aux partitioning (0:n_voxel) + int n; // Number of particles + int n_subsort; // Number of pipelines to be used for subsorts + int vl, vh; // Particles may be contained in voxels [vl,vh]. 
+ int n_voxel; // Number of voxels total (including ghosts) + + PAD_STRUCT( 5 * SIZEOF_MEM_PTR + 5 * sizeof( int ) ) } sort_p_pipeline_args_t; -void -coarse_count_pipeline_scalar( sort_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void coarse_count_pipeline_scalar( sort_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -coarse_sort_pipeline_scalar( sort_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void coarse_sort_pipeline_scalar( sort_p_pipeline_args_t* args, + int pipeline_rank, int n_pipeline ); -void -subsort_pipeline_scalar( sort_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ); +void subsort_pipeline_scalar( sort_p_pipeline_args_t* args, int pipeline_rank, + int n_pipeline ); #endif // _spa_private_h_ diff --git a/src/util/bitfield.h b/src/util/bitfield.h index 1bca9168..97392203 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -2,97 +2,108 @@ #define BitField_h class BitField - { - public: - - BitField(uint32_t setbits = 0xffffffff) : bits_(setbits) {} - BitField(const BitField & bf) : bits_(bf.bits_) {} - ~BitField() {} +{ + public: + BitField( uint32_t setbits = 0xffffffff ) + : bits_( setbits ) + { + } + BitField( const BitField& bf ) + : bits_( bf.bits_ ) + { + } + ~BitField() {} - /*!--------------------------------------------------------------------- - * Set bits in mask. - ----------------------------------------------------------------------*/ - uint32_t set(uint32_t mask) { - return bits_ |= mask; - } // set + /*!--------------------------------------------------------------------- + * Set bits in mask. + ----------------------------------------------------------------------*/ + uint32_t set( uint32_t mask ) { return bits_ |= mask; } // set - /*!--------------------------------------------------------------------- - * Set individual bit. 
- ----------------------------------------------------------------------*/ - uint32_t setibit(size_t bit) { - uint32_t tmp = 1<open(name, io_read) != ok) { - ERROR(( "Unable to open \"%s\" for checkpt read", name )); - } // if + if ( fileIO->open( name, io_read ) != ok ) + { + ERROR( ( "Unable to open \"%s\" for checkpt read", name ) ); + } // if - return reinterpret_cast(fileIO); - } // checkpt_open_rdonly + return reinterpret_cast( fileIO ); + } // checkpt_open_rdonly - static checkpt_t * checkpt_open_wronly(const char * name) { - if(!name) ERROR(("NULL name")); + static checkpt_t* checkpt_open_wronly( const char* name ) + { + if ( !name ) + ERROR( ( "NULL name" ) ); - FileIO * fileIO = new FileIO; + FileIO* fileIO = new FileIO; - if(fileIO->open(name, io_write) != ok) { - ERROR(("Unable to open \"%s\" for checkpt read", name)); - } // if + if ( fileIO->open( name, io_write ) != ok ) + { + ERROR( ( "Unable to open \"%s\" for checkpt read", name ) ); + } // if - return reinterpret_cast(fileIO); - } // checkpt_open_wronly + return reinterpret_cast( fileIO ); + } // checkpt_open_wronly - static void checkpt_close(checkpt_t * checkpt) { - FileIO * fileIO = reinterpret_cast(checkpt); + static void checkpt_close( checkpt_t* checkpt ) + { + FileIO* fileIO = reinterpret_cast( checkpt ); - int32_t err = fileIO->close(); + int32_t err = fileIO->close(); - if(err != 0) { - ERROR(("Error closing file (%d)", err)); - } // if + if ( err != 0 ) + { + ERROR( ( "Error closing file (%d)", err ) ); + } // if - delete fileIO; - } // checkpt_close + delete fileIO; + } // checkpt_close - static void checkpt_read(checkpt_t * checkpt, void * data, size_t sz) { - if(!sz) return; - if(!checkpt || !data) ERROR(("Invalid checkpt_read request")); + static void checkpt_read( checkpt_t* checkpt, void* data, size_t sz ) + { + if ( !sz ) + return; + if ( !checkpt || !data ) + ERROR( ( "Invalid checkpt_read request" ) ); - FileIO * fileIO = reinterpret_cast(checkpt); + FileIO* fileIO = 
reinterpret_cast( checkpt ); - // FIXME: add return values - fileIO->read(reinterpret_cast(data), sz); - } // checkpt_read + // FIXME: add return values + fileIO->read( reinterpret_cast( data ), sz ); + } // checkpt_read - static void checkpt_write(checkpt_t * checkpt, const void * data, - size_t sz) { - if(!sz) return; - if(!checkpt || !data) ERROR(("Invalid checkpt_read request")); + static void checkpt_write( checkpt_t* checkpt, const void* data, size_t sz ) + { + if ( !sz ) + return; + if ( !checkpt || !data ) + ERROR( ( "Invalid checkpt_read request" ) ); - FileIO * fileIO = reinterpret_cast(checkpt); + FileIO* fileIO = reinterpret_cast( checkpt ); - // FIXME: add return values - fileIO->write(reinterpret_cast(data), sz); - } // checkpt_write + // FIXME: add return values + fileIO->write( reinterpret_cast( data ), sz ); + } // checkpt_write }; // struct CheckPtIO diff --git a/src/util/checkpt/checkpt_private.h b/src/util/checkpt/checkpt_private.h index 8fa1c3d2..68a8884c 100644 --- a/src/util/checkpt/checkpt_private.h +++ b/src/util/checkpt/checkpt_private.h @@ -12,24 +12,15 @@ typedef struct checkpt checkpt_t; BEGIN_C_DECLS -checkpt_t * -checkpt_open_rdonly( const char * name ); +checkpt_t* checkpt_open_rdonly( const char* name ); -checkpt_t * -checkpt_open_wronly( const char * name ); +checkpt_t* checkpt_open_wronly( const char* name ); -void -checkpt_close( checkpt_t * checkpt ); +void checkpt_close( checkpt_t* checkpt ); -void -checkpt_read( checkpt_t * checkpt, - void * data, - size_t sz ); +void checkpt_read( checkpt_t* checkpt, void* data, size_t sz ); -void -checkpt_write( checkpt_t * checkpt, - const void * data, - size_t sz ); +void checkpt_write( checkpt_t* checkpt, const void* data, size_t sz ); END_C_DECLS diff --git a/src/util/checksum.h b/src/util/checksum.h index dd21ec86..57c1dbb5 100644 --- a/src/util/checksum.h +++ b/src/util/checksum.h @@ -1,53 +1,57 @@ #ifndef CheckSum_h #define CheckSum_h -#if defined(ENABLE_OPENSSL) +#if defined( 
ENABLE_OPENSSL ) #include -struct CheckSum { - unsigned char value[EVP_MAX_MD_SIZE]; - char strvalue[EVP_MAX_MD_SIZE*2+1]; - unsigned int length; +struct CheckSum +{ + unsigned char value[EVP_MAX_MD_SIZE]; + char strvalue[EVP_MAX_MD_SIZE * 2 + 1]; + unsigned int length; }; // struct CheckSum -template -void checkSumBuffer(T * buffer, size_t elements, CheckSum & sum, - const char * digest = "md5") { - size_t bytes = elements*sizeof(T); +template +void checkSumBuffer( T* buffer, size_t elements, CheckSum& sum, + const char* digest = "md5" ) +{ + size_t bytes = elements * sizeof( T ); - EVP_MD_CTX ctx; + EVP_MD_CTX ctx; - // add all digests to table - OpenSSL_add_all_digests(); + // add all digests to table + OpenSSL_add_all_digests(); - // initialize context - EVP_MD_CTX_init(&ctx); + // initialize context + EVP_MD_CTX_init( &ctx ); - // get digest - const EVP_MD * md = EVP_get_digestbyname(digest); - if(!md) { - ERROR(("Invalid digest!")); - } // if + // get digest + const EVP_MD* md = EVP_get_digestbyname( digest ); + if ( !md ) + { + ERROR( ( "Invalid digest!" ) ); + } // if - // initialize digest - EVP_DigestInit_ex(&ctx, md, NULL); + // initialize digest + EVP_DigestInit_ex( &ctx, md, NULL ); - // update digest with buffer - EVP_DigestUpdate(&ctx, reinterpret_cast(buffer), bytes); - - // finalize - EVP_DigestFinal_ex(&ctx, sum.value, &sum.length); + // update digest with buffer + EVP_DigestUpdate( &ctx, reinterpret_cast( buffer ), bytes ); - // free resources - EVP_MD_CTX_cleanup(&ctx); + // finalize + EVP_DigestFinal_ex( &ctx, sum.value, &sum.length ); - char tmp[256]; - strcpy(sum.strvalue, ""); - for(size_t i(0); i #include "FileIOData.h" +#include /*! - \class FileIO FileIO.h - \brief provides... + \class FileIO FileIO.h + \brief provides... */ -template struct FileIO_T - : public ReadWritePolicy - { - //! Constructor - FileIO_T() {} - - //! 
Destructor - ~FileIO_T() {} - - FileIOStatus open(const char * filename, FileIOMode mode) - { return ReadWritePolicy::open(filename, mode); } - int32_t close() - { return ReadWritePolicy::close(); } - - bool isOpen() - { return ReadWritePolicy::isOpen(); } - - int64_t size() - { return ReadWritePolicy::size(); } - - void print(const char * format, ...) { - va_list args; - va_start(args, format); - ReadWritePolicy::print(format, args); - } - - template - size_t read(T * data, size_t elements) - { return ReadWritePolicy::read(data, elements); } - template - size_t write(const T * data, size_t elements) - { return ReadWritePolicy::write(data, elements); } - - int64_t seek(uint64_t offset, int32_t whence) - { return ReadWritePolicy::seek(offset, whence); } - int64_t tell() - { return ReadWritePolicy::tell(); } - void rewind() - { ReadWritePolicy::rewind(); } - - }; // class FileIO_T - +template +struct FileIO_T : public ReadWritePolicy +{ + //! Constructor + FileIO_T() {} + + //! Destructor + ~FileIO_T() {} + + FileIOStatus open( const char* filename, FileIOMode mode ) + { + return ReadWritePolicy::open( filename, mode ); + } + int32_t close() { return ReadWritePolicy::close(); } + + bool isOpen() { return ReadWritePolicy::isOpen(); } + + int64_t size() { return ReadWritePolicy::size(); } + + void print( const char* format, ... 
) + { + va_list args; + va_start( args, format ); + ReadWritePolicy::print( format, args ); + } + + template + size_t read( T* data, size_t elements ) + { + return ReadWritePolicy::read( data, elements ); + } + template + size_t write( const T* data, size_t elements ) + { + return ReadWritePolicy::write( data, elements ); + } + + int64_t seek( uint64_t offset, int32_t whence ) + { + return ReadWritePolicy::seek( offset, whence ); + } + int64_t tell() { return ReadWritePolicy::tell(); } + void rewind() { ReadWritePolicy::rewind(); } + +}; // class FileIO_T #if defined USE_MPRELAY @@ -71,10 +74,10 @@ typedef FileIO_T FileIO; #else #include "P2PIOPolicy.h" -//typedef FileIO_T > FileIOSwapped; -//typedef FileIO_T > FileIO; -typedef FileIO_T > FileIO; -typedef FileIO_T > FileIOUnswapped; +// typedef FileIO_T > FileIOSwapped; +// typedef FileIO_T > FileIO; +typedef FileIO_T> FileIO; +typedef FileIO_T> FileIOUnswapped; #endif // BUILD #else diff --git a/src/util/io/FileIOData.h b/src/util/io/FileIOData.h index 537e2e5d..846c5936 100644 --- a/src/util/io/FileIOData.h +++ b/src/util/io/FileIOData.h @@ -1,12 +1,12 @@ /* - Definition of FileIOData class + Definition of FileIOData class - Author: Benjamin Karl Bergen + Author: Benjamin Karl Bergen - $Revision$ - $LastChangedBy$ - $LastChangedDate$ - vim: set ts=3 : + $Revision$ + $LastChangedBy$ + $LastChangedDate$ + vim: set ts=3 : */ #ifndef FileIOData_h @@ -14,19 +14,21 @@ #include -enum FileIOMode { - io_closed, - io_read, - io_read_write, - io_write, - io_write_read, - io_append, - io_append_read +enum FileIOMode +{ + io_closed, + io_read, + io_read_write, + io_write, + io_write_read, + io_append, + io_append_read }; // FileIOMode -enum FileIOStatus { - ok, - fail +enum FileIOStatus +{ + ok, + fail }; // FileIOStatus #endif // FileIOData_h diff --git a/src/util/io/FileUtils.h b/src/util/io/FileUtils.h index 0b3e31c2..6c8e6e5a 100644 --- a/src/util/io/FileUtils.h +++ b/src/util/io/FileUtils.h @@ -1,23 +1,25 @@ #ifndef 
FileUtils_h #define FileUtils_h -template class FileUtils_T - : public Policy - { - public: - - FileUtils_T() {} - ~FileUtils_T() {} - - static int makeDirectory(const char * dirname) - { return Policy::makeDirectory(dirname); } - - static int getCurrentWorkingDirectory(char * dirname, size_t size) - { return Policy::getCurrentWorkingDirectory(dirname, size); } - - private: - - }; // class FileUtils_T +template +class FileUtils_T : public Policy +{ + public: + FileUtils_T() {} + ~FileUtils_T() {} + + static int makeDirectory( const char* dirname ) + { + return Policy::makeDirectory( dirname ); + } + + static int getCurrentWorkingDirectory( char* dirname, size_t size ) + { + return Policy::getCurrentWorkingDirectory( dirname, size ); + } + + private: +}; // class FileUtils_T #if defined USE_MPRELAY diff --git a/src/util/io/P2PIOPolicy.h b/src/util/io/P2PIOPolicy.h index b5c38f29..bf4f0ced 100644 --- a/src/util/io/P2PIOPolicy.h +++ b/src/util/io/P2PIOPolicy.h @@ -1,12 +1,12 @@ /* - Definition of P2PIOPolicy class + Definition of P2PIOPolicy class - Author: Benjamin Karl Bergen + Author: Benjamin Karl Bergen - $Revision$ - $LastChangedBy$ - $LastChangedDate$ - vim: set ts=3 : + $Revision$ + $LastChangedBy$ + $LastChangedDate$ + vim: set ts=3 : */ #ifndef P2PIOPolicy_h @@ -16,541 +16,560 @@ #include #include "FileIOData.h" -#include "P2PConnection.h" #include "MPData.h" +#include "P2PConnection.h" #include "swap.h" /*! - \class P2PIOPolicy P2PIOPolicy.h - \brief provides... + \class P2PIOPolicy P2PIOPolicy.h + \brief provides... */ -template +template class P2PIOPolicy - { - public: - - //! Constructor - P2PIOPolicy() - : id_(-1), mode_(io_closed), current_(0), - write_buffer_offset_(0), read_buffer_offset_(0), file_size_(0) - { - pending_[0] = false; - pending_[1] = false; - buffer_fill_[0] = 0; - buffer_fill_[1] = 0; - - io_line_.setId(1); - } - - //! 
Destructor - ~P2PIOPolicy() {} - - FileIOStatus open(const char * filename, FileIOMode mode); - int32_t close(); - - bool isOpen() { return (id_>=0); } - - int64_t size(); - - void print(const char * format, va_list & args); - - template size_t read(T * data, size_t elements); - template size_t write(const T * data, size_t elements); - - int64_t seek(uint64_t offset, int32_t whence); - int64_t tell(); - void rewind(); - void flush(); - - private: - - template inline void swap_bytes(T * data, size_t elements); - - void send_write_block(uint32_t buffer); - int64_t wait_write_block(uint32_t buffer); - - void request_read_block(uint32_t buffer); - int64_t wait_read_block(uint32_t buffer); - - MPBuffer io_buffer_[2]; - MPBuffer io_line_; - - int32_t id_; - FileIOMode mode_; - uint32_t current_; - uint64_t write_buffer_offset_; - uint64_t read_buffer_offset_; - uint64_t buffer_fill_[2]; - bool pending_[2]; - int request_id[2]; - MPRequest request_[2]; - - int64_t file_size_; - ldiv_t read_blocks_; - - }; // class P2PIOPolicy - -template -FileIOStatus P2PIOPolicy::open(const char * filename, FileIOMode mode) - { - assert(id_<0); - P2PConnection & p2p = P2PConnection::instance(); - - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " opening " << filename << std::endl; - */ - - // this sends the string terminator as well - size_t msg_size = strlen(filename)+1; - MPRequest request; - - // re-initialize some values - write_buffer_offset_ = 0; - read_buffer_offset_ = 0; - pending_[0] = false; - pending_[1] = false; - buffer_fill_[0] = 0; - buffer_fill_[1] = 0; - current_ = 0; - - // file io mode - switch(mode) { - - case io_read: - request.set(P2PTag::io_open_read, P2PTag::data, msg_size); - p2p.post(request); - break; - - case io_read_write: - request.set(P2PTag::io_open_read_write, P2PTag::data, msg_size); - p2p.post(request); - break; - - case io_write: - request.set(P2PTag::io_open_write, P2PTag::data, msg_size); - p2p.post(request); - break; - - case io_write_read: 
- request.set(P2PTag::io_open_write_read, P2PTag::data, msg_size); - p2p.post(request); - break; - - case io_append: - request.set(P2PTag::io_open_append, - P2PTag::data, msg_size); - p2p.post(request); - break; - - case io_append_read: - request.set(P2PTag::io_open_append_read, - P2PTag::data, msg_size); - p2p.post(request); - break; - - default: - return fail; - - } // switch - - // save this for flush logic - mode_ = mode; - - // send the filename to peer - p2p.send(const_cast(filename), request.count, request.tag); - - // get file descriptor id - p2p.recv(&id_, 1, request.tag, request.id); - assert(id_>=0); - - if(mode_ == io_read || mode_ == io_read_write) { - p2p.recv(&file_size_, 1, request.tag, id_); - read_blocks_ = ldiv(file_size_, io_buffer_size); - - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " file size: " << file_size_ << " read blocks: " << - read_blocks_.quot << std::endl; - */ - - // request block - request_read_block(current_); - - // request next block - // This will only do a real request if - // read_blocks_ > 1 - request_read_block(current_^1); - - // wait on the first block - wait_read_block(current_); - } // if - - return ok; - } // P2PIOPolicy<>::open - -template +{ + public: + //! Constructor + P2PIOPolicy() + : id_( -1 ) + , mode_( io_closed ) + , current_( 0 ) + , write_buffer_offset_( 0 ) + , read_buffer_offset_( 0 ) + , file_size_( 0 ) + { + pending_[0] = false; + pending_[1] = false; + buffer_fill_[0] = 0; + buffer_fill_[1] = 0; + + io_line_.setId( 1 ); + } + + //! 
Destructor + ~P2PIOPolicy() {} + + FileIOStatus open( const char* filename, FileIOMode mode ); + int32_t close(); + + bool isOpen() { return ( id_ >= 0 ); } + + int64_t size(); + + void print( const char* format, va_list& args ); + + template + size_t read( T* data, size_t elements ); + template + size_t write( const T* data, size_t elements ); + + int64_t seek( uint64_t offset, int32_t whence ); + int64_t tell(); + void rewind(); + void flush(); + + private: + template + inline void swap_bytes( T* data, size_t elements ); + + void send_write_block( uint32_t buffer ); + int64_t wait_write_block( uint32_t buffer ); + + void request_read_block( uint32_t buffer ); + int64_t wait_read_block( uint32_t buffer ); + + MPBuffer io_buffer_[2]; + MPBuffer io_line_; + + int32_t id_; + FileIOMode mode_; + uint32_t current_; + uint64_t write_buffer_offset_; + uint64_t read_buffer_offset_; + uint64_t buffer_fill_[2]; + bool pending_[2]; + int request_id[2]; + MPRequest request_[2]; + + int64_t file_size_; + ldiv_t read_blocks_; + +}; // class P2PIOPolicy + +template +FileIOStatus P2PIOPolicy::open( const char* filename, FileIOMode mode ) +{ + assert( id_ < 0 ); + P2PConnection& p2p = P2PConnection::instance(); + + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " opening " << filename << std::endl; + */ + + // this sends the string terminator as well + size_t msg_size = strlen( filename ) + 1; + MPRequest request; + + // re-initialize some values + write_buffer_offset_ = 0; + read_buffer_offset_ = 0; + pending_[0] = false; + pending_[1] = false; + buffer_fill_[0] = 0; + buffer_fill_[1] = 0; + current_ = 0; + + // file io mode + switch ( mode ) + { + + case io_read: + request.set( P2PTag::io_open_read, P2PTag::data, msg_size ); + p2p.post( request ); + break; + + case io_read_write: + request.set( P2PTag::io_open_read_write, P2PTag::data, msg_size ); + p2p.post( request ); + break; + + case io_write: + request.set( P2PTag::io_open_write, P2PTag::data, msg_size ); + 
p2p.post( request ); + break; + + case io_write_read: + request.set( P2PTag::io_open_write_read, P2PTag::data, msg_size ); + p2p.post( request ); + break; + + case io_append: + request.set( P2PTag::io_open_append, P2PTag::data, msg_size ); + p2p.post( request ); + break; + + case io_append_read: + request.set( P2PTag::io_open_append_read, P2PTag::data, msg_size ); + p2p.post( request ); + break; + + default: + return fail; + + } // switch + + // save this for flush logic + mode_ = mode; + + // send the filename to peer + p2p.send( const_cast( filename ), request.count, request.tag ); + + // get file descriptor id + p2p.recv( &id_, 1, request.tag, request.id ); + assert( id_ >= 0 ); + + if ( mode_ == io_read || mode_ == io_read_write ) + { + p2p.recv( &file_size_, 1, request.tag, id_ ); + read_blocks_ = ldiv( file_size_, io_buffer_size ); + + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " file size: " << file_size_ << " read blocks: " << + read_blocks_.quot << std::endl; + */ + + // request block + request_read_block( current_ ); + + // request next block + // This will only do a real request if + // read_blocks_ > 1 + request_read_block( current_ ^ 1 ); + + // wait on the first block + wait_read_block( current_ ); + } // if + + return ok; +} // P2PIOPolicy<>::open + +template int32_t P2PIOPolicy::close() - { +{ - P2PConnection & p2p = P2PConnection::instance(); - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " closing file" << std::endl; - */ + P2PConnection& p2p = P2PConnection::instance(); + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " closing file" << std::endl; + */ - // force write if current block hasn't been written - if(write_buffer_offset_ > 0) { - flush(); - } // if + // force write if current block hasn't been written + if ( write_buffer_offset_ > 0 ) + { + flush(); + } // if - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " after flush" << std::endl; - */ + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " 
after flush" << std::endl; + */ - MPRequest request(P2PTag::io_close, P2PTag::data, 1, id_); - p2p.post(request); + MPRequest request( P2PTag::io_close, P2PTag::data, 1, id_ ); + p2p.post( request ); - int32_t status(0); - p2p.recv(&status, 1, request.tag, request.id); + int32_t status( 0 ); + p2p.recv( &status, 1, request.tag, request.id ); - id_ = -1; + id_ = -1; - return status; - } // P2PIOPolicy<>::close + return status; +} // P2PIOPolicy<>::close -template +template int64_t P2PIOPolicy::size() - { - assert(id_>=0); - - P2PConnection & p2p = P2PConnection::instance(); - - MPRequest request(P2PTag::io_size, P2PTag::data, 1, id_); - P2PConnection::instance().post(request); - - int64_t tmp; - p2p.recv(&tmp, 1, request.tag, request.id); - - return tmp; - } // P2PIOPolicy<>::size - -template -void P2PIOPolicy::print(const char * format, va_list & args) - { - assert(id_>=0); - - // sprintf to local buffer - vsprintf(io_line_.data(), format, args); - - /* - P2PConnection & p2p = P2PConnection::instance(); - std::cerr << "PPE rank: " << p2p.global_id() << - " printing " << io_line_.data() << std::endl; - */ - - // use write function to do actual work - P2PIOPolicy::write(io_line_.data(), strlen(io_line_.data())); - } // P2PIOPolicy<>::print - -template -template -size_t P2PIOPolicy::read(T * data, size_t elements) - { - assert(id_>=0); - - // everything is done in bytes - uint64_t bytes = elements*sizeof(T); - char * bdata = reinterpret_cast(data); - uint64_t bdata_offset(0); - uint64_t read_bytes(0); - - do { - const int64_t over_run = (read_buffer_offset_ + bytes) - - buffer_fill_[current_]; - - if(over_run > 0) { - const uint64_t under_run = - buffer_fill_[current_] - read_buffer_offset_; - - // copy remainder of current buffer to data - memcpy(bdata + bdata_offset, - io_buffer_[current_].data() + read_buffer_offset_, - under_run); - bdata_offset += under_run; - bytes -= under_run; - - // re-fill current buffer - request_read_block(current_); - current_^=1; - 
if(pending_[current_]) { - read_bytes += wait_read_block(current_); - } // if - read_buffer_offset_ = 0; - } - else { - memcpy(bdata + bdata_offset, - io_buffer_[current_].data() + read_buffer_offset_, bytes); - read_buffer_offset_ += bytes; - bytes = 0; - } // if - } while(bytes > 0); - - // this will only do something if - // this class was instantiated as P2PIOPolicy - swap_bytes(data, elements); - - return static_cast(read_bytes); - } // P2PIOPolicy<>::read - -template -template -size_t P2PIOPolicy::write(const T * data, size_t elements) - { - assert(id_>=0); - assert(mode_ != io_read); - - // book-keeping is done in bytes - uint64_t bytes(elements*sizeof(T)); - const char * bdata = reinterpret_cast(data); - uint64_t bdata_offset(0); - uint64_t write_bytes(0); - - /* - P2PConnection & p2p = P2PConnection::instance(); - std::cerr << "PPE rank: " << p2p.global_id() << - " bytes " << bytes << std::endl; - */ - do { - const int64_t over_run = (write_buffer_offset_ + bytes) - - io_buffer_[current_].size(); - - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " over_run " << over_run << std::endl; - */ - if(over_run > 0) { - const uint64_t under_run = - io_buffer_[current_].size() - write_buffer_offset_; - - // because of the possiblity of byte swapping - // we need to make sure that only even multiples - // of the type are copied at once - const uint64_t copy_bytes = (under_run/sizeof(T))*sizeof(T); - - /* - printf("PPE rank: %d dst %p src %p bytes %ld\n", - p2p.global_id(), - io_buffer_[current_].data() + write_buffer_offset_, - bdata + bdata_offset, copy_bytes); - */ - - memcpy(io_buffer_[current_].data() + write_buffer_offset_, - bdata + bdata_offset, copy_bytes); - - // need to force type here to get correct swapping - swap_bytes(reinterpret_cast - (io_buffer_[current_].data() + write_buffer_offset_), - copy_bytes/sizeof(T)); - - bdata_offset += copy_bytes; - bytes -= copy_bytes; - - request_[current_].set(P2PTag::io_write, P2PTag::data, - 
write_buffer_offset_ + copy_bytes, id_); - send_write_block(current_); - current_^=1; - if(pending_[current_]) { - write_bytes += wait_write_block(current_); - } // if - write_buffer_offset_ = 0; - } - else { - /* - printf("PPE rank: %d dst %p src %p bytes %ld\n", - p2p.global_id(), - io_buffer_[current_].data() + write_buffer_offset_, - bdata + bdata_offset, bytes); - */ - - memcpy(io_buffer_[current_].data() + write_buffer_offset_, - bdata + bdata_offset, bytes); - - // need to force type here to get correct swapping - swap_bytes(reinterpret_cast - (io_buffer_[current_].data() + write_buffer_offset_), - bytes/sizeof(T)); - - write_buffer_offset_ += bytes; - bytes = 0; - } // if - } while(bytes > 0); - - return static_cast(write_bytes); - } // P2PIOPolicy<>::write - -template -int64_t P2PIOPolicy::seek(uint64_t offset, int32_t whence) - { - assert(id_>=0); - - MPRequest request(P2PTag::io_seek, P2PTag::data, 0, id_); - P2PConnection & p2p = P2PConnection::instance(); - - p2p.post(request); - p2p.send(&offset, 1, P2PTag::data); - p2p.send(&whence, 1, P2PTag::data); - - //FIXME: need real return - return 0; - } // P2PIOPolicy<>::seek - -template +{ + assert( id_ >= 0 ); + + P2PConnection& p2p = P2PConnection::instance(); + + MPRequest request( P2PTag::io_size, P2PTag::data, 1, id_ ); + P2PConnection::instance().post( request ); + + int64_t tmp; + p2p.recv( &tmp, 1, request.tag, request.id ); + + return tmp; +} // P2PIOPolicy<>::size + +template +void P2PIOPolicy::print( const char* format, va_list& args ) +{ + assert( id_ >= 0 ); + + // sprintf to local buffer + vsprintf( io_line_.data(), format, args ); + + /* + P2PConnection & p2p = P2PConnection::instance(); + std::cerr << "PPE rank: " << p2p.global_id() << + " printing " << io_line_.data() << std::endl; + */ + + // use write function to do actual work + P2PIOPolicy::write( io_line_.data(), strlen( io_line_.data() ) ); +} // P2PIOPolicy<>::print + +template +template +size_t P2PIOPolicy::read( T* data, size_t 
elements ) +{ + assert( id_ >= 0 ); + + // everything is done in bytes + uint64_t bytes = elements * sizeof( T ); + char* bdata = reinterpret_cast( data ); + uint64_t bdata_offset( 0 ); + uint64_t read_bytes( 0 ); + + do + { + const int64_t over_run = + ( read_buffer_offset_ + bytes ) - buffer_fill_[current_]; + + if ( over_run > 0 ) + { + const uint64_t under_run = + buffer_fill_[current_] - read_buffer_offset_; + + // copy remainder of current buffer to data + memcpy( bdata + bdata_offset, + io_buffer_[current_].data() + read_buffer_offset_, + under_run ); + bdata_offset += under_run; + bytes -= under_run; + + // re-fill current buffer + request_read_block( current_ ); + current_ ^= 1; + if ( pending_[current_] ) + { + read_bytes += wait_read_block( current_ ); + } // if + read_buffer_offset_ = 0; + } + else + { + memcpy( bdata + bdata_offset, + io_buffer_[current_].data() + read_buffer_offset_, bytes ); + read_buffer_offset_ += bytes; + bytes = 0; + } // if + } while ( bytes > 0 ); + + // this will only do something if + // this class was instantiated as P2PIOPolicy + swap_bytes( data, elements ); + + return static_cast( read_bytes ); +} // P2PIOPolicy<>::read + +template +template +size_t P2PIOPolicy::write( const T* data, size_t elements ) +{ + assert( id_ >= 0 ); + assert( mode_ != io_read ); + + // book-keeping is done in bytes + uint64_t bytes( elements * sizeof( T ) ); + const char* bdata = reinterpret_cast( data ); + uint64_t bdata_offset( 0 ); + uint64_t write_bytes( 0 ); + + /* + P2PConnection & p2p = P2PConnection::instance(); + std::cerr << "PPE rank: " << p2p.global_id() << + " bytes " << bytes << std::endl; + */ + do + { + const int64_t over_run = + ( write_buffer_offset_ + bytes ) - io_buffer_[current_].size(); + + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " over_run " << over_run << std::endl; + */ + if ( over_run > 0 ) + { + const uint64_t under_run = + io_buffer_[current_].size() - write_buffer_offset_; + + // because of the 
possiblity of byte swapping + // we need to make sure that only even multiples + // of the type are copied at once + const uint64_t copy_bytes = + ( under_run / sizeof( T ) ) * sizeof( T ); + + /* + printf("PPE rank: %d dst %p src %p bytes %ld\n", + p2p.global_id(), + io_buffer_[current_].data() + write_buffer_offset_, + bdata + bdata_offset, copy_bytes); + */ + + memcpy( io_buffer_[current_].data() + write_buffer_offset_, + bdata + bdata_offset, copy_bytes ); + + // need to force type here to get correct swapping + swap_bytes( reinterpret_cast( io_buffer_[current_].data() + + write_buffer_offset_ ), + copy_bytes / sizeof( T ) ); + + bdata_offset += copy_bytes; + bytes -= copy_bytes; + + request_[current_].set( P2PTag::io_write, P2PTag::data, + write_buffer_offset_ + copy_bytes, id_ ); + send_write_block( current_ ); + current_ ^= 1; + if ( pending_[current_] ) + { + write_bytes += wait_write_block( current_ ); + } // if + write_buffer_offset_ = 0; + } + else + { + /* + printf("PPE rank: %d dst %p src %p bytes %ld\n", + p2p.global_id(), + io_buffer_[current_].data() + write_buffer_offset_, + bdata + bdata_offset, bytes); + */ + + memcpy( io_buffer_[current_].data() + write_buffer_offset_, + bdata + bdata_offset, bytes ); + + // need to force type here to get correct swapping + swap_bytes( reinterpret_cast( io_buffer_[current_].data() + + write_buffer_offset_ ), + bytes / sizeof( T ) ); + + write_buffer_offset_ += bytes; + bytes = 0; + } // if + } while ( bytes > 0 ); + + return static_cast( write_bytes ); +} // P2PIOPolicy<>::write + +template +int64_t P2PIOPolicy::seek( uint64_t offset, int32_t whence ) +{ + assert( id_ >= 0 ); + + MPRequest request( P2PTag::io_seek, P2PTag::data, 0, id_ ); + P2PConnection& p2p = P2PConnection::instance(); + + p2p.post( request ); + p2p.send( &offset, 1, P2PTag::data ); + p2p.send( &whence, 1, P2PTag::data ); + + // FIXME: need real return + return 0; +} // P2PIOPolicy<>::seek + +template int64_t P2PIOPolicy::tell() - { - 
assert(id_>=0); +{ + assert( id_ >= 0 ); - MPRequest request(P2PTag::io_tell, P2PTag::data, 0, id_); - P2PConnection & p2p = P2PConnection::instance(); + MPRequest request( P2PTag::io_tell, P2PTag::data, 0, id_ ); + P2PConnection& p2p = P2PConnection::instance(); - p2p.post(request); + p2p.post( request ); - int64_t offset; - p2p.recv(&offset, 1, request.tag, request.id); + int64_t offset; + p2p.recv( &offset, 1, request.tag, request.id ); - return offset; - } // P2PIOPolicy<>::tell + return offset; +} // P2PIOPolicy<>::tell -template +template void P2PIOPolicy::rewind() - { - assert(id_>=0); - - MPRequest request(P2PTag::io_rewind, P2PTag::data, 0, id_); - P2PConnection & p2p = P2PConnection::instance(); - - p2p.post(request); - } // P2PIOPolicy<>::rewind - -template -void P2PIOPolicy::request_read_block(uint32_t buffer) - { - P2PConnection & p2p = P2PConnection::instance(); - - if(read_blocks_.quot > 0) { - request_[buffer].set(P2PTag::io_read, P2PTag::data, - io_buffer_[buffer].size(), id_); - - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " requesting " << request_[buffer].count << - " bytes " << std::endl; - */ - - p2p.post(request_[buffer]); - p2p.irecv(io_buffer_[buffer].data(), request_[buffer].count, - request_[buffer].tag, request_[buffer].id); - pending_[buffer] = true; - buffer_fill_[buffer] = request_[buffer].count; - - read_blocks_.quot--; - } - else if(read_blocks_.rem > 0) { - request_[buffer].set(P2PTag::io_read, P2PTag::data, - read_blocks_.rem, id_); - - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " requesting " << request_[buffer].count << - " bytes " << std::endl; - */ - - p2p.post(request_[buffer]); - p2p.irecv(io_buffer_[buffer].data(), request_[buffer].count, - request_[buffer].tag, request_[buffer].id); - pending_[buffer] = true; - buffer_fill_[buffer] = request_[buffer].count; - - read_blocks_.rem = 0; - } // if - } // P2PIOPolicy<>::request_read_block - -template -int64_t P2PIOPolicy::wait_read_block(uint32_t buffer) 
- { - P2PConnection & p2p = P2PConnection::instance(); - p2p.wait_recv(request_[buffer].id); - pending_[buffer] = false; - return request_[buffer].count; - } // P2PIOPolicy<>::wait_read_block - -template -void P2PIOPolicy::send_write_block(uint32_t buffer) - { - P2PConnection & p2p = P2PConnection::instance(); - - p2p.post(request_[buffer]); - /* - std::cerr << "PPE rank: " << p2p.global_id() << - " sending " << request_[buffer].count << std::endl; - */ - p2p.isend(io_buffer_[buffer].data(), request_[buffer].count, - request_[buffer].tag, request_[buffer].id); - pending_[buffer] = true; - } // P2PIOPolicy<>::send_write_block - -template -int64_t P2PIOPolicy::wait_write_block(uint32_t buffer) - { - P2PConnection & p2p = P2PConnection::instance(); - p2p.wait_send(request_[buffer].id); - pending_[buffer] = false; - return request_[buffer].count; - } // P2PIOPolicy<>::wait_write_block - -template +{ + assert( id_ >= 0 ); + + MPRequest request( P2PTag::io_rewind, P2PTag::data, 0, id_ ); + P2PConnection& p2p = P2PConnection::instance(); + + p2p.post( request ); +} // P2PIOPolicy<>::rewind + +template +void P2PIOPolicy::request_read_block( uint32_t buffer ) +{ + P2PConnection& p2p = P2PConnection::instance(); + + if ( read_blocks_.quot > 0 ) + { + request_[buffer].set( P2PTag::io_read, P2PTag::data, + io_buffer_[buffer].size(), id_ ); + + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " requesting " << request_[buffer].count << + " bytes " << std::endl; + */ + + p2p.post( request_[buffer] ); + p2p.irecv( io_buffer_[buffer].data(), request_[buffer].count, + request_[buffer].tag, request_[buffer].id ); + pending_[buffer] = true; + buffer_fill_[buffer] = request_[buffer].count; + + read_blocks_.quot--; + } + else if ( read_blocks_.rem > 0 ) + { + request_[buffer].set( P2PTag::io_read, P2PTag::data, read_blocks_.rem, + id_ ); + + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " requesting " << request_[buffer].count << + " bytes " << std::endl; + */ + + 
p2p.post( request_[buffer] ); + p2p.irecv( io_buffer_[buffer].data(), request_[buffer].count, + request_[buffer].tag, request_[buffer].id ); + pending_[buffer] = true; + buffer_fill_[buffer] = request_[buffer].count; + + read_blocks_.rem = 0; + } // if +} // P2PIOPolicy<>::request_read_block + +template +int64_t P2PIOPolicy::wait_read_block( uint32_t buffer ) +{ + P2PConnection& p2p = P2PConnection::instance(); + p2p.wait_recv( request_[buffer].id ); + pending_[buffer] = false; + return request_[buffer].count; +} // P2PIOPolicy<>::wait_read_block + +template +void P2PIOPolicy::send_write_block( uint32_t buffer ) +{ + P2PConnection& p2p = P2PConnection::instance(); + + p2p.post( request_[buffer] ); + /* + std::cerr << "PPE rank: " << p2p.global_id() << + " sending " << request_[buffer].count << std::endl; + */ + p2p.isend( io_buffer_[buffer].data(), request_[buffer].count, + request_[buffer].tag, request_[buffer].id ); + pending_[buffer] = true; +} // P2PIOPolicy<>::send_write_block + +template +int64_t P2PIOPolicy::wait_write_block( uint32_t buffer ) +{ + P2PConnection& p2p = P2PConnection::instance(); + p2p.wait_send( request_[buffer].id ); + pending_[buffer] = false; + return request_[buffer].count; +} // P2PIOPolicy<>::wait_write_block + +template void P2PIOPolicy::flush() - { - /* - P2PConnection & p2p = P2PConnection::instance(); - std::cerr << "PPE rank: " << p2p.global_id() << - " write_buffer_offset_: " << write_buffer_offset_ << std::endl; - */ - - // check to see if we need to flush the current buffer - if(write_buffer_offset_ > 0) { - request_[current_].set(P2PTag::io_write, P2PTag::data, - write_buffer_offset_, id_); - send_write_block(current_); - - current_ ^= 1; - } // if - - - // wait on the other buffer if it is in flight - if(pending_[current_]) { - wait_write_block(current_); - } // if - - // wait on the one we just sent - current_ ^= 1; - wait_write_block(current_); - } // P2PIOPolicy<>::flush - -template<> -template inline -void 
P2PIOPolicy::swap_bytes(T * data, size_t elements) - { - for(size_t i(0); i::swap_bytes - -template<> -template inline -void P2PIOPolicy::swap_bytes(T * data, size_t elements) - { - } // P2PIOPolicy<>::swap_bytes +{ + /* + P2PConnection & p2p = P2PConnection::instance(); + std::cerr << "PPE rank: " << p2p.global_id() << + " write_buffer_offset_: " << write_buffer_offset_ << std::endl; + */ + + // check to see if we need to flush the current buffer + if ( write_buffer_offset_ > 0 ) + { + request_[current_].set( P2PTag::io_write, P2PTag::data, + write_buffer_offset_, id_ ); + send_write_block( current_ ); + + current_ ^= 1; + } // if + + // wait on the other buffer if it is in flight + if ( pending_[current_] ) + { + wait_write_block( current_ ); + } // if + + // wait on the one we just sent + current_ ^= 1; + wait_write_block( current_ ); +} // P2PIOPolicy<>::flush + +template <> +template +inline void P2PIOPolicy::swap_bytes( T* data, size_t elements ) +{ + for ( size_t i( 0 ); i < elements; i++ ) + { + utils::swap( data[i] ); + } // for +} // P2PIOPolicy<>::swap_bytes + +template <> +template +inline void P2PIOPolicy::swap_bytes( T* data, size_t elements ) +{ +} // P2PIOPolicy<>::swap_bytes #endif // P2PIOPolicy_h diff --git a/src/util/io/P2PUtilsPolicy.h b/src/util/io/P2PUtilsPolicy.h index a3d31264..2101ccc9 100644 --- a/src/util/io/P2PUtilsPolicy.h +++ b/src/util/io/P2PUtilsPolicy.h @@ -4,48 +4,46 @@ #include "P2PConnection.h" class P2PUtilsPolicy - { - public: +{ + public: + P2PUtilsPolicy() {} + ~P2PUtilsPolicy() {} - P2PUtilsPolicy() {} - ~P2PUtilsPolicy() {} + static int makeDirectory( const char* dirname ); + static int getCurrentWorkingDirectory( char* dirname, size_t size ); - static int makeDirectory(const char * dirname); - static int getCurrentWorkingDirectory(char * dirname, size_t size); + private: +}; // class P2PUtilsPolicy - private: +inline int P2PUtilsPolicy::makeDirectory( const char* dirname ) +{ + P2PConnection& p2p = 
P2PConnection::instance(); - }; // class P2PUtilsPolicy + size_t msg_size = strlen( dirname ) + 1; + int retval; + MPRequest request( P2PTag::utils_mkdir, P2PTag::data, msg_size ); -inline int P2PUtilsPolicy::makeDirectory(const char * dirname) - { - P2PConnection & p2p = P2PConnection::instance(); + p2p.post( request ); + p2p.send( const_cast( dirname ), request.count, request.tag ); + p2p.recv( &retval, 1, request.tag, request.id ); - size_t msg_size = strlen(dirname)+1; - int retval; - MPRequest request(P2PTag::utils_mkdir, P2PTag::data, msg_size); + return retval; +} // P2PUtilsPolicy::makeDirectory - p2p.post(request); - p2p.send(const_cast(dirname), request.count, request.tag); - p2p.recv(&retval, 1, request.tag, request.id); +inline int P2PUtilsPolicy::getCurrentWorkingDirectory( char* dirname, + size_t size ) +{ + P2PConnection& p2p = P2PConnection::instance(); - return retval; - } // P2PUtilsPolicy::makeDirectory + size_t msg_size = size; + int retval; + MPRequest request( P2PTag::utils_mkdir, P2PTag::data, msg_size ); -inline int P2PUtilsPolicy::getCurrentWorkingDirectory(char * dirname, - size_t size) - { - P2PConnection & p2p = P2PConnection::instance(); + p2p.post( request ); + p2p.recv( dirname, size, request.count, request.tag ); + p2p.recv( &retval, 1, request.tag, request.id ); - size_t msg_size = size; - int retval; - MPRequest request(P2PTag::utils_mkdir, P2PTag::data, msg_size); - - p2p.post(request); - p2p.recv(dirname, size, request.count, request.tag); - p2p.recv(&retval, 1, request.tag, request.id); - - return retval; - } // P2PUtilsPolicy::getCurrentWorkingDirectory + return retval; +} // P2PUtilsPolicy::getCurrentWorkingDirectory #endif // P2PUtilsPolicy_h diff --git a/src/util/io/StandardIOPolicy.h b/src/util/io/StandardIOPolicy.h index 15c15401..33210ae2 100644 --- a/src/util/io/StandardIOPolicy.h +++ b/src/util/io/StandardIOPolicy.h @@ -1,166 +1,171 @@ /* - Definition of StandardIOPolicy class + Definition of StandardIOPolicy class - 
Author: Benjamin Karl Bergen + Author: Benjamin Karl Bergen - $Revision$ - $LastChangedBy$ - $LastChangedDate$ - vim: set ts=3 : + $Revision$ + $LastChangedBy$ + $LastChangedDate$ + vim: set ts=3 : */ #ifndef StandardIOPolicy_h #define StandardIOPolicy_h -#include #include #include +#include #include "FileIOData.h" /*! - \class StandardIOPolicy StandardIOPolicy.h - \brief provides... + \class StandardIOPolicy StandardIOPolicy.h + \brief provides... */ class StandardIOPolicy - { - public: +{ + public: + //! Constructor + StandardIOPolicy() + : handle_( nullptr ) + { + } - //! Constructor - StandardIOPolicy() : handle_(nullptr) {} + //! Destructor + ~StandardIOPolicy() {} - //! Destructor - ~StandardIOPolicy() {} + // open/close methods + FileIOStatus open( const char* filename, FileIOMode mode ); + int32_t close(); - // open/close methods - FileIOStatus open(const char * filename, FileIOMode mode); - int32_t close(); - - bool isOpen() { return is_open_; } + bool isOpen() { return is_open_; } - // return file size in bytes - int64_t size(); + // return file size in bytes + int64_t size(); - // ascii methods - void print(const char * format, va_list & args); + // ascii methods + void print( const char* format, va_list& args ); - // binary methods - template size_t read(T * data, size_t elements); - template size_t write(const T * data, size_t elements); + // binary methods + template + size_t read( T* data, size_t elements ); + template + size_t write( const T* data, size_t elements ); - int64_t seek(uint64_t offset, int32_t whence); - int64_t tell(); - void rewind(); - void flush(); + int64_t seek( uint64_t offset, int32_t whence ); + int64_t tell(); + void rewind(); + void flush(); - private: + private: + bool is_open_; + FILE* handle_; - bool is_open_; - FILE * handle_; +}; // class StandardIOPolicy - }; // class StandardIOPolicy +inline FileIOStatus StandardIOPolicy::open( const char* filename, + FileIOMode mode ) +{ + handle_ = nullptr; -inline FileIOStatus 
-StandardIOPolicy::open(const char * filename, FileIOMode mode) - { - handle_ = nullptr; + switch ( mode ) + { - switch(mode) { - - case io_read: - handle_ = fopen(filename, "r"); - break; + case io_read: + handle_ = fopen( filename, "r" ); + break; - case io_read_write: - handle_ = fopen(filename, "r+"); - break; + case io_read_write: + handle_ = fopen( filename, "r+" ); + break; - case io_write: - handle_ = fopen(filename, "w"); - break; + case io_write: + handle_ = fopen( filename, "w" ); + break; - case io_write_read: - handle_ = fopen(filename, "w+"); - break; + case io_write_read: + handle_ = fopen( filename, "w+" ); + break; - case io_append: - handle_ = fopen(filename, "a"); - break; + case io_append: + handle_ = fopen( filename, "a" ); + break; - case io_append_read: - handle_ = fopen(filename, "a+"); - break; + case io_append_read: + handle_ = fopen( filename, "a+" ); + break; - default: - return fail; + default: + return fail; - } // switch + } // switch - if(handle_ == nullptr) { - return fail; - } // if + if ( handle_ == nullptr ) + { + return fail; + } // if - is_open_ = true; - return ok; - } // StandardIOPolicy::StandardIOPolicy + is_open_ = true; + return ok; +} // StandardIOPolicy::StandardIOPolicy inline int32_t StandardIOPolicy::close() - { - int32_t status = fclose(handle_); - is_open_ = false; - return status; - } // StandardIOPolicy::~StandardIOPolicy +{ + int32_t status = fclose( handle_ ); + is_open_ = false; + return status; +} // StandardIOPolicy::~StandardIOPolicy inline int64_t StandardIOPolicy::size() - { - int64_t current = ftell(handle_); - fseek(handle_, 0L, SEEK_END); - int64_t size = ftell(handle_); - fseek(handle_, current, SEEK_SET); - return size; - } // StandardIOPolicy::size - -inline void StandardIOPolicy::print(const char * format, va_list & args) - { - // print to file - vfprintf(handle_, format, args); - - // end list - va_end(args); - } // StandardIOPolicy::print - -template -inline size_t StandardIOPolicy::read(T * 
data, size_t elements) - { - return fread(reinterpret_cast(data), sizeof(T), - elements, handle_); - } // StandardIOPolicy::read - -template -inline size_t StandardIOPolicy::write(const T * data, size_t elements) - { - return fwrite(reinterpret_cast(const_cast(data)), - sizeof(T), elements, handle_); - } // StandardIOPolicy::write - -inline int64_t StandardIOPolicy::seek(uint64_t offset, int32_t whence) - { - return fseek(handle_, offset, whence); - } // StandardIOPolicy::seek +{ + int64_t current = ftell( handle_ ); + fseek( handle_, 0L, SEEK_END ); + int64_t size = ftell( handle_ ); + fseek( handle_, current, SEEK_SET ); + return size; +} // StandardIOPolicy::size + +inline void StandardIOPolicy::print( const char* format, va_list& args ) +{ + // print to file + vfprintf( handle_, format, args ); + + // end list + va_end( args ); +} // StandardIOPolicy::print + +template +inline size_t StandardIOPolicy::read( T* data, size_t elements ) +{ + return fread( reinterpret_cast( data ), sizeof( T ), elements, + handle_ ); +} // StandardIOPolicy::read + +template +inline size_t StandardIOPolicy::write( const T* data, size_t elements ) +{ + return fwrite( reinterpret_cast( const_cast( data ) ), + sizeof( T ), elements, handle_ ); +} // StandardIOPolicy::write + +inline int64_t StandardIOPolicy::seek( uint64_t offset, int32_t whence ) +{ + return fseek( handle_, offset, whence ); +} // StandardIOPolicy::seek inline int64_t StandardIOPolicy::tell() - { - return int64_t(ftell(handle_)); - } // StandardIOPolicy::tell +{ + return int64_t( ftell( handle_ ) ); +} // StandardIOPolicy::tell inline void StandardIOPolicy::rewind() - { - StandardIOPolicy::seek(uint64_t(0), SEEK_SET); - } // StandardIOPolicy::rewind +{ + StandardIOPolicy::seek( uint64_t( 0 ), SEEK_SET ); +} // StandardIOPolicy::rewind inline void StandardIOPolicy::flush() - { - fflush(handle_); - } // StandardIOPolicy::rewind +{ + fflush( handle_ ); +} // StandardIOPolicy::rewind #endif // StandardIOPolicy_h diff 
--git a/src/util/io/StandardUtilsPolicy.h b/src/util/io/StandardUtilsPolicy.h index 34108459..0d1d99c2 100644 --- a/src/util/io/StandardUtilsPolicy.h +++ b/src/util/io/StandardUtilsPolicy.h @@ -1,31 +1,33 @@ #ifndef StandardUtilsPolicy_h #define StandardUtilsPolicy_h -#include -#include +#include +#include class StandardUtilsPolicy - { - public: - - StandardUtilsPolicy() {} - ~StandardUtilsPolicy() {} - - static int makeDirectory(const char * dirname) { - return mkdir(dirname, S_IRWXU); - } // makeDirectory - - static int getCurrentWorkingDirectory(char * dirname, size_t size) { - if(getcwd(dirname, size) == NULL) { - return -1; - } - else { - return 0; - } - } // getCurrentWorkingDirectory - - private: - - }; // class StandardUtilsPolicy +{ + public: + StandardUtilsPolicy() {} + ~StandardUtilsPolicy() {} + + static int makeDirectory( const char* dirname ) + { + return mkdir( dirname, S_IRWXU ); + } // makeDirectory + + static int getCurrentWorkingDirectory( char* dirname, size_t size ) + { + if ( getcwd( dirname, size ) == NULL ) + { + return -1; + } + else + { + return 0; + } + } // getCurrentWorkingDirectory + + private: +}; // class StandardUtilsPolicy #endif // StandardUtilsPolicy_h diff --git a/src/util/mp/DMPPolicy.h b/src/util/mp/DMPPolicy.h index 535e14b6..86760874 100644 --- a/src/util/mp/DMPPolicy.h +++ b/src/util/mp/DMPPolicy.h @@ -1,34 +1,39 @@ #ifndef DMPPolicy_h #define DMPPolicy_h -#include -#include #include +#include #include "../checkpt/checkpt.h" /* Define this comm and mp opaque handles */ /* FIXME: PARENT, COLOR AND KEY ARE FOR FUTURE EXPANSION */ -struct collective { - collective_t * parent; - int color; - int key; - MPI_Comm comm; +struct collective +{ + collective_t* parent; + int color; + int key; + MPI_Comm comm; }; -struct mp { - int n_port; - char * ALIGNED(128) * rbuf; char * ALIGNED(128) * sbuf; - int * rbuf_sz; int * sbuf_sz; - int * rreq_sz; int * sreq_sz; - MPI_Request * rreq; MPI_Request * sreq; +struct mp +{ + int n_port; + char* 
ALIGNED( 128 ) * rbuf; + char* ALIGNED( 128 ) * sbuf; + int* rbuf_sz; + int* sbuf_sz; + int* rreq_sz; + int* sreq_sz; + MPI_Request* rreq; + MPI_Request* sreq; }; /* Create the world collective */ -static collective_t __world = { NULL, 0, 0, MPI_COMM_SELF }; -collective_t * _world = &__world; +static collective_t __world = {NULL, 0, 0, MPI_COMM_SELF}; +collective_t* _world = &__world; int _world_rank = 0; int _world_size = 1; @@ -36,316 +41,338 @@ int _world_size = 1; /* FIXME: SINCE RIGHT NOW, THERE IS ONLY THE WORLD COLLECTIVE AND NO WAY TO CREATE CHILDREN COLLECTIVES, THIS IS BASICALLY A PLACEHOLDER. */ -void -checkpt_collective( const collective_t * comm ) { - CHECKPT_VAL( int, world_rank ); - CHECKPT_VAL( int, world_size ); +void checkpt_collective( const collective_t* comm ) +{ + CHECKPT_VAL( int, world_rank ); + CHECKPT_VAL( int, world_size ); } -collective_t * -restore_collective( void ) { - int rank, size; - RESTORE_VAL( int, rank ); - RESTORE_VAL( int, size ); - if( size!=world_size ) - ERROR(( "The number of nodes that made this checkpt (%i) is different " - "from the number of nodes currently (%i)", - size, world_size )); - if( rank!=world_rank ) - ERROR(( "This node (%i) is reading a checkpoint previously written by " - "a different node (%i).", rank, world_rank )); - return world; +collective_t* restore_collective( void ) +{ + int rank, size; + RESTORE_VAL( int, rank ); + RESTORE_VAL( int, size ); + if ( size != world_size ) + ERROR( ( "The number of nodes that made this checkpt (%i) is different " + "from the number of nodes currently (%i)", + size, world_size ) ); + if ( rank != world_rank ) + ERROR( ( "This node (%i) is reading a checkpoint previously written by " + "a different node (%i).", + rank, world_rank ) ); + return world; } /* mp checkpointer */ -void -checkpt_mp( mp_t * mp ) { - int port; - CHECKPT( mp, 1 ); - CHECKPT( mp->rbuf, mp->n_port ); CHECKPT( mp->sbuf, mp->n_port ); - CHECKPT( mp->rbuf_sz, mp->n_port ); CHECKPT( mp->sbuf_sz, 
mp->n_port ); - CHECKPT( mp->rreq_sz, mp->n_port ); CHECKPT( mp->sreq_sz, mp->n_port ); - CHECKPT( mp->rreq, mp->n_port ); CHECKPT( mp->sreq, mp->n_port ); - for( port=0; portn_port; port++ ) { - CHECKPT_ALIGNED( mp->rbuf[port], mp->rbuf_sz[port], 128 ); - CHECKPT_ALIGNED( mp->sbuf[port], mp->sbuf_sz[port], 128 ); - } +void checkpt_mp( mp_t* mp ) +{ + int port; + CHECKPT( mp, 1 ); + CHECKPT( mp->rbuf, mp->n_port ); + CHECKPT( mp->sbuf, mp->n_port ); + CHECKPT( mp->rbuf_sz, mp->n_port ); + CHECKPT( mp->sbuf_sz, mp->n_port ); + CHECKPT( mp->rreq_sz, mp->n_port ); + CHECKPT( mp->sreq_sz, mp->n_port ); + CHECKPT( mp->rreq, mp->n_port ); + CHECKPT( mp->sreq, mp->n_port ); + for ( port = 0; port < mp->n_port; port++ ) + { + CHECKPT_ALIGNED( mp->rbuf[port], mp->rbuf_sz[port], 128 ); + CHECKPT_ALIGNED( mp->sbuf[port], mp->sbuf_sz[port], 128 ); + } } -mp_t * -restore_mp( void ) { - mp_t * mp; - int port; - RESTORE( mp ); - RESTORE( mp->rbuf ); RESTORE( mp->sbuf ); - RESTORE( mp->rbuf_sz ); RESTORE( mp->sbuf_sz ); - RESTORE( mp->rreq_sz ); RESTORE( mp->sreq_sz ); - RESTORE( mp->rreq ); RESTORE( mp->sreq ); - for( port=0; portn_port; port++ ) { - RESTORE_ALIGNED( mp->rbuf[port] ); - RESTORE_ALIGNED( mp->sbuf[port] ); - } - return mp; +mp_t* restore_mp( void ) +{ + mp_t* mp; + int port; + RESTORE( mp ); + RESTORE( mp->rbuf ); + RESTORE( mp->sbuf ); + RESTORE( mp->rbuf_sz ); + RESTORE( mp->sbuf_sz ); + RESTORE( mp->rreq_sz ); + RESTORE( mp->sreq_sz ); + RESTORE( mp->rreq ); + RESTORE( mp->sreq ); + for ( port = 0; port < mp->n_port; port++ ) + { + RESTORE_ALIGNED( mp->rbuf[port] ); + RESTORE_ALIGNED( mp->sbuf[port] ); + } + return mp; } -struct DMPPolicy { - - // FIXME-KJB: The whole sizing process in here is kinda silly and should - // be removed in the long haul. 
- -# define RESIZE_FACTOR 1.3125 -# define TRAP( x ) do { \ - int ierr = (x); \ - if( ierr!=MPI_SUCCESS ) ERROR(( "MPI error %i on "#x, ierr )); \ - } while(0) - - inline void - boot_mp( int * pargc, - char *** pargv ) { - TRAP( MPI_Init( pargc, pargv ) ); - TRAP( MPI_Comm_dup( MPI_COMM_WORLD, &__world.comm ) ); - __world.parent = NULL, __world.color = 0, __world.key = 0; - TRAP( MPI_Comm_rank( __world.comm, &_world_rank ) ); - TRAP( MPI_Comm_size( __world.comm, &_world_size ) ); - REGISTER_OBJECT( &__world, checkpt_collective, restore_collective, NULL ); - } - - inline void - halt_mp( void ) { - UNREGISTER_OBJECT( &__world ); - TRAP( MPI_Comm_free( &__world.comm ) ); - __world.parent = NULL, __world.color = 0, __world.key = 0; - __world.comm = MPI_COMM_SELF; - _world_size = 1; - _world_rank = 0; - TRAP( MPI_Finalize() ); - } - - inline void - mp_abort( int reason ) { - MPI_Abort( world->comm, reason ); - } - - inline void - mp_barrier( void ) { - TRAP( MPI_Barrier( world->comm ) ); - } - - inline void - mp_allsum_d( double * local, - double * global, - int n ) { - if( !local || !global || n<1 || std::abs(local-global)comm ) ); - } - - inline void - mp_allsum_i( int * local, - int * global, - int n ) { - if( !local || !global || n<1 || std::abs(local-global)comm ) ); - } - - inline void - mp_allgather_i( int * sbuf, - int * rbuf, - int n ) { - if( !sbuf || !rbuf || n<1 ) ERROR(( "Bad args" )); - TRAP( MPI_Allgather( sbuf, n, MPI_INT, rbuf, n, MPI_INT, world->comm ) ); - } - - inline void - mp_allgather_i64( int64_t * sbuf, - int64_t * rbuf, - int n ) { - if( !sbuf || !rbuf || n<1 ) ERROR(( "Bad args" )); - TRAP( MPI_Allgather( sbuf, n, MPI_LONG_LONG, rbuf, n, MPI_LONG_LONG, world->comm ) ); - } - - inline void - mp_gather_uc( unsigned char * sbuf, - unsigned char * rbuf, - int n ) { - if( !sbuf || (!rbuf && world_rank==0) || n<1 ) ERROR(( "Bad args" )); - TRAP( MPI_Gather( sbuf, n, MPI_CHAR, rbuf, n, MPI_CHAR, 0, world->comm ) ); - } - - inline void - mp_send_i( 
int * buf, - int n, - int dst ) { - if( !buf || n<1 || dst<0 || dst>=world_size ) ERROR(( "Bad args" )); - TRAP( MPI_Send( buf, n, MPI_INT, dst, 0, world->comm ) ); - } - - inline void - mp_recv_i( int * buf, - int n, - int src ) { - if( !buf || n<1 || src<0 || src>=world_size ) ERROR(( "Bad args" )); - TRAP( MPI_Recv( buf, n, MPI_INT, src, 0, world->comm, MPI_STATUS_IGNORE ) ); - } - - inline mp_t * - new_mp( int n_port ) { - mp_t * mp; - if( n_port<1 ) ERROR(( "Bad args" )); - MALLOC( mp, 1 ); - mp->n_port = n_port; - MALLOC( mp->rbuf, n_port ); MALLOC( mp->sbuf, n_port ); - MALLOC( mp->rbuf_sz, n_port ); MALLOC( mp->sbuf_sz, n_port ); - MALLOC( mp->rreq_sz, n_port ); MALLOC( mp->sreq_sz, n_port ); - MALLOC( mp->rreq, n_port ); MALLOC( mp->sreq, n_port ); - CLEAR( mp->rbuf, n_port ); CLEAR( mp->sbuf, n_port ); - CLEAR( mp->rbuf_sz, n_port ); CLEAR( mp->sbuf_sz, n_port ); - CLEAR( mp->rreq_sz, n_port ); CLEAR( mp->sreq_sz, n_port ); - CLEAR( mp->rreq, n_port ); CLEAR( mp->sreq, n_port ); - REGISTER_OBJECT( mp, checkpt_mp, restore_mp, NULL ); - return mp; - } - - inline void - delete_mp( mp_t * mp ) { - int port; - if( !mp ) return; - UNREGISTER_OBJECT( mp ); - for( port=0; portn_port; port++ ) { - FREE_ALIGNED( mp->rbuf[port] ); FREE_ALIGNED( mp->sbuf[port] ); +struct DMPPolicy +{ + + // FIXME-KJB: The whole sizing process in here is kinda silly and should + // be removed in the long haul. 
+ +#define RESIZE_FACTOR 1.3125 +#define TRAP( x ) \ + do \ + { \ + int ierr = ( x ); \ + if ( ierr != MPI_SUCCESS ) \ + ERROR( ( "MPI error %i on " #x, ierr ) ); \ + } while ( 0 ) + + inline void boot_mp( int* pargc, char*** pargv ) + { + TRAP( MPI_Init( pargc, pargv ) ); + TRAP( MPI_Comm_dup( MPI_COMM_WORLD, &__world.comm ) ); + __world.parent = NULL, __world.color = 0, __world.key = 0; + TRAP( MPI_Comm_rank( __world.comm, &_world_rank ) ); + TRAP( MPI_Comm_size( __world.comm, &_world_size ) ); + REGISTER_OBJECT( &__world, checkpt_collective, restore_collective, + NULL ); + } + + inline void halt_mp( void ) + { + UNREGISTER_OBJECT( &__world ); + TRAP( MPI_Comm_free( &__world.comm ) ); + __world.parent = NULL, __world.color = 0, __world.key = 0; + __world.comm = MPI_COMM_SELF; + _world_size = 1; + _world_rank = 0; + TRAP( MPI_Finalize() ); } - FREE( mp->rreq ); FREE( mp->sreq ); - FREE( mp->rreq_sz ); FREE( mp->sreq_sz ); - FREE( mp->rbuf_sz ); FREE( mp->sbuf_sz ); - FREE( mp->rbuf ); FREE( mp->sbuf ); - FREE( mp ); - } - - inline void * ALIGNED(128) - mp_recv_buffer( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - return mp->rbuf[port]; - } - - inline void * ALIGNED(128) - mp_send_buffer( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - return mp->sbuf[port]; - } - - inline void - mp_size_recv_buffer( mp_t * mp, - int port, - int sz ) { - char * ALIGNED(128) buf; - - if( !mp || port<0 || port>mp->n_port || sz<1 ) ERROR(( "Bad args" )); - - // If there already a large enough buffer, we are done - if( mp->rbuf_sz[port]>=sz ) return; - - // Try to reduce the number of reallocs - sz = (int)( sz*(double)RESIZE_FACTOR ); - - // If no buffer allocated for this port, malloc it and return - if( !mp->rbuf[port] ) { - MALLOC_ALIGNED( mp->rbuf[port], sz, 128 ); - mp->rbuf_sz[port] = sz; - return; + + inline void mp_abort( int reason ) { MPI_Abort( world->comm, reason ); } + + inline void 
mp_barrier( void ) { TRAP( MPI_Barrier( world->comm ) ); } + + inline void mp_allsum_d( double* local, double* global, int n ) + { + if ( !local || !global || n < 1 || std::abs( local - global ) < n ) + { + ERROR( ( "Bad args" ) ); + } // if + TRAP( MPI_Allreduce( local, global, n, MPI_DOUBLE, MPI_SUM, + world->comm ) ); } - // Resize the existing buffer (preserving any data in it) - // (FIXME: THIS IS PROBABLY SILLY!) - MALLOC_ALIGNED( buf, sz, 128 ); - COPY( buf, mp->rbuf[port], mp->rbuf_sz[port] ); - FREE_ALIGNED( mp->rbuf[port] ); - mp->rbuf[port] = buf; - mp->rbuf_sz[port] = sz; - } - - inline void - mp_size_send_buffer( mp_t * mp, - int port, - int sz ) { - char * ALIGNED(128) buf; - - // Check input arguments - if( !mp || port<0 || port>mp->n_port || sz<1 ) ERROR(( "Bad args" )); - - // Is there already a large enough buffer - if( mp->sbuf_sz[port]>=sz ) return; - - // Try to reduce the number of reallocs - sz = (int)( sz*(double)RESIZE_FACTOR ); - - // If no buffer allocated for this port, malloc it and return - if( !mp->sbuf[port] ) { - MALLOC_ALIGNED( mp->sbuf[port], sz, 128 ); - mp->sbuf_sz[port] = sz; - return; + inline void mp_allsum_i( int* local, int* global, int n ) + { + if ( !local || !global || n < 1 || std::abs( local - global ) < n ) + { + ERROR( ( "Bad args" ) ); + } // if + TRAP( + MPI_Allreduce( local, global, n, MPI_INT, MPI_SUM, world->comm ) ); } - - // Resize the existing buffer (preserving any data in it) - // (FIXME: THIS IS PROBABLY SILLY!) 
- MALLOC_ALIGNED( buf, sz, 128 ); - COPY( buf, mp->sbuf[port], mp->sbuf_sz[port] ); - FREE_ALIGNED( mp->sbuf[port] ); - mp->sbuf[port] = buf; - mp->sbuf_sz[port] = sz; - } - - inline void - mp_begin_recv( mp_t * mp, - int port, - int sz, - int src, - int tag ) { - if( !mp || port<0 || port>=mp->n_port || sz<1 || sz>mp->rbuf_sz[port] || - src<0 || src>=world_size ) ERROR(( "Bad args" )); - mp->rreq_sz[port] = sz; - TRAP(MPI_Irecv(mp->rbuf[port], sz, MPI_BYTE, src, tag, world->comm, &mp->rreq[port])); - } - - inline void - mp_begin_send( mp_t * mp, - int port, - int sz, - int dst, - int tag ) { - if( !mp || port<0 || port>=mp->n_port || dst<0 || dst>=world_size || - sz<1 || mp->sbuf_sz[port]sreq_sz[port] = sz; - TRAP(MPI_Issend(mp->sbuf[port],sz, MPI_BYTE, dst, tag, world->comm, &mp->sreq[port])); - } - - inline void - mp_end_recv( mp_t * mp, - int port ) { - MPI_Status status; - int sz; - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - TRAP( MPI_Wait( &mp->rreq[port], &status ) ); - TRAP( MPI_Get_count( &status, MPI_BYTE, &sz ) ); - if( mp->rreq_sz[port]!=sz ) ERROR(( "Sizes do not match" )); - } - - inline void - mp_end_send( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - TRAP( MPI_Wait( &mp->sreq[port], MPI_STATUS_IGNORE ) ); - } - -# undef RESIZE_FACTOR -# undef TRAP -}; // struct DMPPolicy + inline void mp_allgather_i( int* sbuf, int* rbuf, int n ) + { + if ( !sbuf || !rbuf || n < 1 ) + ERROR( ( "Bad args" ) ); + TRAP( + MPI_Allgather( sbuf, n, MPI_INT, rbuf, n, MPI_INT, world->comm ) ); + } + + inline void mp_allgather_i64( int64_t* sbuf, int64_t* rbuf, int n ) + { + if ( !sbuf || !rbuf || n < 1 ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Allgather( sbuf, n, MPI_LONG_LONG, rbuf, n, MPI_LONG_LONG, + world->comm ) ); + } + + inline void mp_gather_uc( unsigned char* sbuf, unsigned char* rbuf, int n ) + { + if ( !sbuf || ( !rbuf && world_rank == 0 ) || n < 1 ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Gather( 
sbuf, n, MPI_CHAR, rbuf, n, MPI_CHAR, 0, + world->comm ) ); + } + + inline void mp_send_i( int* buf, int n, int dst ) + { + if ( !buf || n < 1 || dst < 0 || dst >= world_size ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Send( buf, n, MPI_INT, dst, 0, world->comm ) ); + } + + inline void mp_recv_i( int* buf, int n, int src ) + { + if ( !buf || n < 1 || src < 0 || src >= world_size ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Recv( buf, n, MPI_INT, src, 0, world->comm, + MPI_STATUS_IGNORE ) ); + } + + inline mp_t* new_mp( int n_port ) + { + mp_t* mp; + if ( n_port < 1 ) + ERROR( ( "Bad args" ) ); + MALLOC( mp, 1 ); + mp->n_port = n_port; + MALLOC( mp->rbuf, n_port ); + MALLOC( mp->sbuf, n_port ); + MALLOC( mp->rbuf_sz, n_port ); + MALLOC( mp->sbuf_sz, n_port ); + MALLOC( mp->rreq_sz, n_port ); + MALLOC( mp->sreq_sz, n_port ); + MALLOC( mp->rreq, n_port ); + MALLOC( mp->sreq, n_port ); + CLEAR( mp->rbuf, n_port ); + CLEAR( mp->sbuf, n_port ); + CLEAR( mp->rbuf_sz, n_port ); + CLEAR( mp->sbuf_sz, n_port ); + CLEAR( mp->rreq_sz, n_port ); + CLEAR( mp->sreq_sz, n_port ); + CLEAR( mp->rreq, n_port ); + CLEAR( mp->sreq, n_port ); + REGISTER_OBJECT( mp, checkpt_mp, restore_mp, NULL ); + return mp; + } + inline void delete_mp( mp_t* mp ) + { + int port; + if ( !mp ) + return; + UNREGISTER_OBJECT( mp ); + for ( port = 0; port < mp->n_port; port++ ) + { + FREE_ALIGNED( mp->rbuf[port] ); + FREE_ALIGNED( mp->sbuf[port] ); + } + FREE( mp->rreq ); + FREE( mp->sreq ); + FREE( mp->rreq_sz ); + FREE( mp->sreq_sz ); + FREE( mp->rbuf_sz ); + FREE( mp->sbuf_sz ); + FREE( mp->rbuf ); + FREE( mp->sbuf ); + FREE( mp ); + } + + inline void* ALIGNED( 128 ) mp_recv_buffer( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + return mp->rbuf[port]; + } + + inline void* ALIGNED( 128 ) mp_send_buffer( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + return mp->sbuf[port]; + } + + inline void 
mp_size_recv_buffer( mp_t* mp, int port, int sz ) + { + char* ALIGNED( 128 ) buf; + + if ( !mp || port < 0 || port > mp->n_port || sz < 1 ) + ERROR( ( "Bad args" ) ); + + // If there already a large enough buffer, we are done + if ( mp->rbuf_sz[port] >= sz ) + return; + + // Try to reduce the number of reallocs + sz = (int)( sz * (double)RESIZE_FACTOR ); + + // If no buffer allocated for this port, malloc it and return + if ( !mp->rbuf[port] ) + { + MALLOC_ALIGNED( mp->rbuf[port], sz, 128 ); + mp->rbuf_sz[port] = sz; + return; + } + + // Resize the existing buffer (preserving any data in it) + // (FIXME: THIS IS PROBABLY SILLY!) + MALLOC_ALIGNED( buf, sz, 128 ); + COPY( buf, mp->rbuf[port], mp->rbuf_sz[port] ); + FREE_ALIGNED( mp->rbuf[port] ); + mp->rbuf[port] = buf; + mp->rbuf_sz[port] = sz; + } + + inline void mp_size_send_buffer( mp_t* mp, int port, int sz ) + { + char* ALIGNED( 128 ) buf; + + // Check input arguments + if ( !mp || port < 0 || port > mp->n_port || sz < 1 ) + ERROR( ( "Bad args" ) ); + + // Is there already a large enough buffer + if ( mp->sbuf_sz[port] >= sz ) + return; + + // Try to reduce the number of reallocs + sz = (int)( sz * (double)RESIZE_FACTOR ); + + // If no buffer allocated for this port, malloc it and return + if ( !mp->sbuf[port] ) + { + MALLOC_ALIGNED( mp->sbuf[port], sz, 128 ); + mp->sbuf_sz[port] = sz; + return; + } + + // Resize the existing buffer (preserving any data in it) + // (FIXME: THIS IS PROBABLY SILLY!) 
+ MALLOC_ALIGNED( buf, sz, 128 ); + COPY( buf, mp->sbuf[port], mp->sbuf_sz[port] ); + FREE_ALIGNED( mp->sbuf[port] ); + mp->sbuf[port] = buf; + mp->sbuf_sz[port] = sz; + } + + inline void mp_begin_recv( mp_t* mp, int port, int sz, int src, int tag ) + { + if ( !mp || port < 0 || port >= mp->n_port || sz < 1 || + sz > mp->rbuf_sz[port] || src < 0 || src >= world_size ) + ERROR( ( "Bad args" ) ); + mp->rreq_sz[port] = sz; + TRAP( MPI_Irecv( mp->rbuf[port], sz, MPI_BYTE, src, tag, world->comm, + &mp->rreq[port] ) ); + } + + inline void mp_begin_send( mp_t* mp, int port, int sz, int dst, int tag ) + { + if ( !mp || port < 0 || port >= mp->n_port || dst < 0 || + dst >= world_size || sz < 1 || mp->sbuf_sz[port] < sz ) + ERROR( ( "Bad args" ) ); + mp->sreq_sz[port] = sz; + TRAP( MPI_Issend( mp->sbuf[port], sz, MPI_BYTE, dst, tag, world->comm, + &mp->sreq[port] ) ); + } + + inline void mp_end_recv( mp_t* mp, int port ) + { + MPI_Status status; + int sz; + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Wait( &mp->rreq[port], &status ) ); + TRAP( MPI_Get_count( &status, MPI_BYTE, &sz ) ); + if ( mp->rreq_sz[port] != sz ) + ERROR( ( "Sizes do not match" ) ); + } + + inline void mp_end_send( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + TRAP( MPI_Wait( &mp->sreq[port], MPI_STATUS_IGNORE ) ); + } + +#undef RESIZE_FACTOR +#undef TRAP + +}; // struct DMPPolicy #endif // DMPPolicy_h diff --git a/src/util/mp/MPWrapper.h b/src/util/mp/MPWrapper.h index fe2a665c..260ea73d 100644 --- a/src/util/mp/MPWrapper.h +++ b/src/util/mp/MPWrapper.h @@ -1,16 +1,21 @@ #ifndef MPWrapper_h #define MPWrapper_h -template -class MPWrapper_T : public MPPolicy { -public: - static MPWrapper_T & instance() { static MPWrapper_T mpw; return mpw; } - // inherited public interface -private: - // hide these to keep things safe - MPWrapper_T() {} - MPWrapper_T( const MPWrapper_T & mpw ) {} - ~MPWrapper_T() {} +template 
+class MPWrapper_T : public MPPolicy +{ + public: + static MPWrapper_T& instance() + { + static MPWrapper_T mpw; + return mpw; + } + // inherited public interface + private: + // hide these to keep things safe + MPWrapper_T() {} + MPWrapper_T( const MPWrapper_T& mpw ) {} + ~MPWrapper_T() {} }; #ifdef USE_MPRELAY diff --git a/src/util/mp/RelayPolicy.h b/src/util/mp/RelayPolicy.h index 237dddff..7de71381 100644 --- a/src/util/mp/RelayPolicy.h +++ b/src/util/mp/RelayPolicy.h @@ -8,25 +8,31 @@ /* Define this comm and mp opaque handles */ /* FIXME: PARENT, COLOR AND KEY ARE FOR FUTURE EXPANSION */ -struct collective { - collective_t * parent; - int color; - int key; - /* FIXME: RELAY HAS NO WAY OF DOING AN MPI_COMM_SPLIT RIGHT NOW */ +struct collective +{ + collective_t* parent; + int color; + int key; + /* FIXME: RELAY HAS NO WAY OF DOING AN MPI_COMM_SPLIT RIGHT NOW */ }; -struct mp { - int n_port; - char * ALIGNED(128) * rbuf; char * ALIGNED(128) * sbuf; - int * rbuf_sz; int * sbuf_sz; - int * rreq_sz; int * sreq_sz; - int * rreq; int * sreq; +struct mp +{ + int n_port; + char* ALIGNED( 128 ) * rbuf; + char* ALIGNED( 128 ) * sbuf; + int* rbuf_sz; + int* sbuf_sz; + int* rreq_sz; + int* sreq_sz; + int* rreq; + int* sreq; }; /* Create the world collective */ -static collective_t __world = { NULL, 0, 0 }; -collective_t * _world = &__world; +static collective_t __world = {NULL, 0, 0}; +collective_t* _world = &__world; int _world_rank = 0; int _world_size = 1; @@ -35,355 +41,376 @@ int _world_size = 1; TO CREATE CHILDREN COLLECTIVES (NOT EVEN IN PRINCIPLE WITH THE CURRENT STATE OF RELAY), THIS IS BASICALLY A PLACEHOLDER. 
*/ -void -checkpt_collective( const collective_t * comm ) { - CHECKPT_VAL( int, world_rank ); - CHECKPT_VAL( int, world_size ); +void checkpt_collective( const collective_t* comm ) +{ + CHECKPT_VAL( int, world_rank ); + CHECKPT_VAL( int, world_size ); } -collective_t * -restore_collective( void ) { - int rank, size; - RESTORE_VAL( int, rank ); - RESTORE_VAL( int, size ); - if( size!=world_size ) - ERROR(( "The number of processes that made this checkpt (%i) is different " - "from the number of processes currently (%i)", size, world_size )); - if( rank!=world_rank ) - ERROR(( "This process (%i) is reading a checkpoint previously written by " - "a different process (%i)", rank, world_rank )); - return world; +collective_t* restore_collective( void ) +{ + int rank, size; + RESTORE_VAL( int, rank ); + RESTORE_VAL( int, size ); + if ( size != world_size ) + ERROR( ( + "The number of processes that made this checkpt (%i) is different " + "from the number of processes currently (%i)", + size, world_size ) ); + if ( rank != world_rank ) + ERROR( + ( "This process (%i) is reading a checkpoint previously written by " + "a different process (%i)", + rank, world_rank ) ); + return world; } /* mp checkpointer */ -void -checkpt_mp( mp_t * mp ) { - int port; - CHECKPT( mp, 1 ); - CHECKPT( mp->rbuf, mp->n_port ); CHECKPT( mp->sbuf, mp->n_port ); - CHECKPT( mp->rbuf_sz, mp->n_port ); CHECKPT( mp->sbuf_sz, mp->n_port ); - CHECKPT( mp->rreq_sz, mp->n_port ); CHECKPT( mp->sreq_sz, mp->n_port ); - CHECKPT( mp->rreq, mp->n_port ); CHECKPT( mp->sreq, mp->n_port ); - for( port=0; portn_port; port++ ) { - CHECKPT_ALIGNED( mp->rbuf[port], mp->rbuf_sz[port], 128 ); - CHECKPT_ALIGNED( mp->sbuf[port], mp->sbuf_sz[port], 128 ); - } +void checkpt_mp( mp_t* mp ) +{ + int port; + CHECKPT( mp, 1 ); + CHECKPT( mp->rbuf, mp->n_port ); + CHECKPT( mp->sbuf, mp->n_port ); + CHECKPT( mp->rbuf_sz, mp->n_port ); + CHECKPT( mp->sbuf_sz, mp->n_port ); + CHECKPT( mp->rreq_sz, mp->n_port ); + CHECKPT( 
mp->sreq_sz, mp->n_port ); + CHECKPT( mp->rreq, mp->n_port ); + CHECKPT( mp->sreq, mp->n_port ); + for ( port = 0; port < mp->n_port; port++ ) + { + CHECKPT_ALIGNED( mp->rbuf[port], mp->rbuf_sz[port], 128 ); + CHECKPT_ALIGNED( mp->sbuf[port], mp->sbuf_sz[port], 128 ); + } } -mp_t * -restore_mp( void ) { - mp_t * mp; - int port; - RESTORE( mp ); - RESTORE( mp->rbuf ); RESTORE( mp->sbuf ); - RESTORE( mp->rbuf_sz ); RESTORE( mp->sbuf_sz ); - RESTORE( mp->rreq_sz ); RESTORE( mp->sreq_sz ); - RESTORE( mp->rreq ); RESTORE( mp->sreq ); - for( port=0; portn_port; port++ ) { - RESTORE_ALIGNED( mp->rbuf[port] ); - RESTORE_ALIGNED( mp->sbuf[port] ); - } - return mp; +mp_t* restore_mp( void ) +{ + mp_t* mp; + int port; + RESTORE( mp ); + RESTORE( mp->rbuf ); + RESTORE( mp->sbuf ); + RESTORE( mp->rbuf_sz ); + RESTORE( mp->sbuf_sz ); + RESTORE( mp->rreq_sz ); + RESTORE( mp->sreq_sz ); + RESTORE( mp->rreq ); + RESTORE( mp->sreq ); + for ( port = 0; port < mp->n_port; port++ ) + { + RESTORE_ALIGNED( mp->rbuf[port] ); + RESTORE_ALIGNED( mp->sbuf[port] ); + } + return mp; } -struct RelayPolicy { - - // FIXME: ERROR CODE HANDLING - - // FIXME-KJB: The whole sizing process in here is kinda silly and should - // be removed in the long haul. 
- -# define RESIZE_FACTOR 1.3125 - - inline void - boot_mp( int * pargc, - char *** pargv ) { - ConnectionManager::instance().init( pargc, pargv ); - P2PConnection & p2p = P2PConnection::instance(); - __world.parent = NULL, __world.color = 0, __world.key = 0; - _world_rank = p2p.global_id(); - _world_size = p2p.global_size(); - REGISTER_OBJECT( &__world, checkpt_collective, restore_collective, NULL ); - } - - inline void - halt_mp( void ) { - UNREGISTER_OBJECT( &__world ); - __world.parent = NULL, __world.color = 0, __world.key = 0; - _world_size = 1; - _world_rank = 0; - P2PConnection::instance().post( P2PTag::end ); - ConnectionManager::instance().finalize(); - } - - inline void - mp_abort( int reason ) { - P2PConnection & p2p = P2PConnection::instance(); - MPRequest request( P2PTag::abort, P2PTag::data, 1, 0 ); - p2p.post( request ); - p2p.send( &reason, 1, P2PTag::data ); - p2p.abort( reason ); - } - - inline void - mp_barrier( void ) { - P2PConnection & p2p = P2PConnection::instance(); - p2p.post( P2PTag::barrier ); - p2p.barrier(); - } - - inline void - mp_allsum_d( double * local, - double * global, - int n ) { - if( !local || !global || n<1 || abs(local-global)=world_size ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - MPRequest request( P2PTag::send, P2PTag::data, n, 0, dst ); - p2p.post( request ); - p2p.send( buf, request.count, request.tag ); - } - - inline void - mp_recv_i( int * buf, - int n, - int src ) { - if( !buf || n<1 || src<0 || src>=world_size ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - MPRequest request( P2PTag::recv, P2PTag::data, n, 0, src ); - p2p.post( request ); - p2p.recv( buf, request.count, request.tag, request.id ); - } - - /* ---- BEGIN EXACT CUT-AND-PASTE JOB FROM DMPPOLICY ---- */ - /* FIXME-KJB: AT THIS POINT, MUCH OF MP IN DMP AND RELAY COULD BE EXTRACTED - INTO A UNIFIED IMPLEMENTATION (AND, AT THE SAME TIME, THE API FIXED) */ - - inline mp_t * - new_mp( int n_port 
) { - mp_t * mp; - if( n_port<1 ) ERROR(( "Bad args" )); - MALLOC( mp, 1 ); - mp->n_port = n_port; - MALLOC( mp->rbuf, n_port ); MALLOC( mp->sbuf, n_port ); - MALLOC( mp->rbuf_sz, n_port ); MALLOC( mp->sbuf_sz, n_port ); - MALLOC( mp->rreq_sz, n_port ); MALLOC( mp->sreq_sz, n_port ); - MALLOC( mp->rreq, n_port ); MALLOC( mp->sreq, n_port ); - CLEAR( mp->rbuf, n_port ); CLEAR( mp->sbuf, n_port ); - CLEAR( mp->rbuf_sz, n_port ); CLEAR( mp->sbuf_sz, n_port ); - CLEAR( mp->rreq_sz, n_port ); CLEAR( mp->sreq_sz, n_port ); - CLEAR( mp->rreq, n_port ); CLEAR( mp->sreq, n_port ); - REGISTER_OBJECT( mp, checkpt_mp, restore_mp, NULL ); - return mp; - } - - inline void - delete_mp( mp_t * mp ) { - int port; - if( !mp ) return; - UNREGISTER_OBJECT( mp ); - for( port=0; portn_port; port++ ) { - FREE_ALIGNED( mp->rbuf[port] ); FREE_ALIGNED( mp->sbuf[port] ); +struct RelayPolicy +{ + + // FIXME: ERROR CODE HANDLING + + // FIXME-KJB: The whole sizing process in here is kinda silly and should + // be removed in the long haul. 
+ +#define RESIZE_FACTOR 1.3125 + + inline void boot_mp( int* pargc, char*** pargv ) + { + ConnectionManager::instance().init( pargc, pargv ); + P2PConnection& p2p = P2PConnection::instance(); + __world.parent = NULL, __world.color = 0, __world.key = 0; + _world_rank = p2p.global_id(); + _world_size = p2p.global_size(); + REGISTER_OBJECT( &__world, checkpt_collective, restore_collective, + NULL ); } - FREE( mp->rreq ); FREE( mp->sreq ); - FREE( mp->rreq_sz ); FREE( mp->sreq_sz ); - FREE( mp->rbuf_sz ); FREE( mp->sbuf_sz ); - FREE( mp->rbuf ); FREE( mp->sbuf ); - FREE( mp ); - } - - inline void * ALIGNED(128) - mp_recv_buffer( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - return mp->rbuf[port]; - } - - inline void * ALIGNED(128) - mp_send_buffer( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - return mp->sbuf[port]; - } - - inline void - mp_size_recv_buffer( mp_t * mp, - int port, - int sz ) { - char * ALIGNED(128) buf; - - if( !mp || port<0 || port>mp->n_port || sz<1 ) ERROR(( "Bad args" )); - - // If there already a large enough buffer, we are done - if( mp->rbuf_sz[port]>=sz ) return; - - // Try to reduce the number of reallocs - sz = (int)( sz*(double)RESIZE_FACTOR ); - - // If no buffer allocated for this port, malloc it and return - if( !mp->rbuf[port] ) { - MALLOC_ALIGNED( mp->rbuf[port], sz, 128 ); - mp->rbuf_sz[port] = sz; - return; + + inline void halt_mp( void ) + { + UNREGISTER_OBJECT( &__world ); + __world.parent = NULL, __world.color = 0, __world.key = 0; + _world_size = 1; + _world_rank = 0; + P2PConnection::instance().post( P2PTag::end ); + ConnectionManager::instance().finalize(); } - // Resize the existing buffer (preserving any data in it) - // (FIXME: THIS IS PROBABLY SILLY!) 
- MALLOC_ALIGNED( buf, sz, 128 ); - COPY( buf, mp->rbuf[port], mp->rbuf_sz[port] ); - FREE_ALIGNED( mp->rbuf[port] ); - mp->rbuf[port] = buf; - mp->rbuf_sz[port] = sz; - } - - inline void - mp_size_send_buffer( mp_t * mp, - int port, - int sz ) { - char * ALIGNED(128) buf; - - // Check input arguments - if( !mp || port<0 || port>mp->n_port || sz<1 ) ERROR(( "Bad args" )); - - // Is there already a large enough buffer - if( mp->sbuf_sz[port]>=sz ) return; - - // Try to reduce the number of reallocs - sz = (int)( sz*(double)RESIZE_FACTOR ); - - // If no buffer allocated for this port, malloc it and return - if( !mp->sbuf[port] ) { - MALLOC_ALIGNED( mp->sbuf[port], sz, 128 ); - mp->sbuf_sz[port] = sz; - return; + inline void mp_abort( int reason ) + { + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::abort, P2PTag::data, 1, 0 ); + p2p.post( request ); + p2p.send( &reason, 1, P2PTag::data ); + p2p.abort( reason ); } - - // Resize the existing buffer (preserving any data in it) - // (FIXME: THIS IS PROBABLY SILLY!) 
- MALLOC_ALIGNED( buf, sz, 128 ); - COPY( buf, mp->sbuf[port], mp->sbuf_sz[port] ); - FREE_ALIGNED( mp->sbuf[port] ); - mp->sbuf[port] = buf; - mp->sbuf_sz[port] = sz; - } - - /* ---- END CUT-AND-PASTE JOB FROM DMPPOLICY ---- */ - - inline void - mp_begin_recv( mp_t * mp, - int port, - int sz, - int src, - int tag ) { - if( !mp || port<0 || port>=mp->n_port || sz<1 || sz>mp->rbuf_sz[port] || - src<0 || src>=world_size ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - mp->rreq_sz[port] = sz; - MPRequest request( P2PTag::irecv, tag, sz, port, src ); - p2p.post( request ); - p2p.irecv( static_cast(mp->rbuf[port]), sz, tag, port ); - } - - inline void - mp_begin_send( mp_t * mp, - int port, - int sz, - int dst, - int tag ) { - if( !mp || port<0 || port>=mp->n_port || sz<1 || sz>mp->sbuf_sz[port] || - dst<0 || dst>=world_size ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - mp->sreq_sz[port] = sz; - MPRequest request( P2PTag::isend, tag, sz, port, dst ); - p2p.post( request ); - p2p.isend( static_cast(mp->sbuf[port]), sz, tag, port ); - } - - inline void - mp_end_recv( mp_t * mp, - int port ) { - int sz; - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - MPRequest request( P2PTag::wait_recv, 0, 0, port ); - p2p.post( request ); - p2p.wait_recv( port ); - p2p.get_count( port, sz ); - /* FIXME: SHOULDN'T WE CHECK SZ==RREQ_SZ HERE? 
*/ - } - - inline void - mp_end_send( mp_t * mp, - int port ) { - if( !mp || port<0 || port>=mp->n_port ) ERROR(( "Bad args" )); - P2PConnection & p2p = P2PConnection::instance(); - MPRequest request( P2PTag::wait_send, 0, 0, port ); - p2p.post( request ); - p2p.wait_send( port ); - } - -# undef RESIZE_FACTOR + + inline void mp_barrier( void ) + { + P2PConnection& p2p = P2PConnection::instance(); + p2p.post( P2PTag::barrier ); + p2p.barrier(); + } + + inline void mp_allsum_d( double* local, double* global, int n ) + { + if ( !local || !global || n < 1 || abs( local - global ) < n ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::allreduce_sum_double, P2PTag::data, n, 0 ); + p2p.post( request ); + p2p.send( local, request.count, request.tag ); + p2p.recv( global, request.count, request.tag, request.id ); + } + + inline void mp_allsum_i( int* local, int* global, int n ) + { + if ( !local || !global || n < 1 || abs( local - global ) < n ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::allreduce_sum_int, P2PTag::data, n, 0 ); + p2p.post( request ); + p2p.send( local, request.count, request.tag ); + p2p.recv( global, request.count, request.tag, request.id ); + } + + inline void mp_allgather_i( int* sbuf, int* rbuf, int n ) + { + if ( !sbuf || !rbuf || n < 1 ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::allgather_int, P2PTag::data, n, 0 ); + p2p.post( request ); + p2p.send( sbuf, request.count, request.tag ); + p2p.recv( rbuf, request.count * world_size, request.tag, request.id ); + } + + inline void mp_allgather_i64( int64_t* sbuf, int64_t* rbuf, int n ) + { + if ( !sbuf || !rbuf || n < 1 ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::allgather_int64, P2PTag::data, n, 0 ); + p2p.post( request ); + p2p.send( sbuf, request.count, 
request.tag ); + p2p.recv( rbuf, request.count * world_size, request.tag, request.id ); + } + + inline void mp_gather_uc( unsigned char* sbuf, unsigned char* rbuf, int n ) + { + if ( !sbuf || ( !rbuf && world_rank == 0 ) || n < 1 ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::gather_uc, P2PTag::data, n, 0 ); + p2p.post( request ); + p2p.send( sbuf, request.count, request.tag ); + if ( world_rank == 0 ) + p2p.recv( rbuf, request.count * world_size, request.tag, + request.id ); + } + + inline void mp_send_i( int* buf, int n, int dst ) + { + if ( !buf || n < 1 || dst < 0 || dst >= world_size ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::send, P2PTag::data, n, 0, dst ); + p2p.post( request ); + p2p.send( buf, request.count, request.tag ); + } + + inline void mp_recv_i( int* buf, int n, int src ) + { + if ( !buf || n < 1 || src < 0 || src >= world_size ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::recv, P2PTag::data, n, 0, src ); + p2p.post( request ); + p2p.recv( buf, request.count, request.tag, request.id ); + } + + /* ---- BEGIN EXACT CUT-AND-PASTE JOB FROM DMPPOLICY ---- */ + /* FIXME-KJB: AT THIS POINT, MUCH OF MP IN DMP AND RELAY COULD BE EXTRACTED + INTO A UNIFIED IMPLEMENTATION (AND, AT THE SAME TIME, THE API FIXED) */ + + inline mp_t* new_mp( int n_port ) + { + mp_t* mp; + if ( n_port < 1 ) + ERROR( ( "Bad args" ) ); + MALLOC( mp, 1 ); + mp->n_port = n_port; + MALLOC( mp->rbuf, n_port ); + MALLOC( mp->sbuf, n_port ); + MALLOC( mp->rbuf_sz, n_port ); + MALLOC( mp->sbuf_sz, n_port ); + MALLOC( mp->rreq_sz, n_port ); + MALLOC( mp->sreq_sz, n_port ); + MALLOC( mp->rreq, n_port ); + MALLOC( mp->sreq, n_port ); + CLEAR( mp->rbuf, n_port ); + CLEAR( mp->sbuf, n_port ); + CLEAR( mp->rbuf_sz, n_port ); + CLEAR( mp->sbuf_sz, n_port ); + CLEAR( mp->rreq_sz, n_port ); + CLEAR( mp->sreq_sz, 
n_port ); + CLEAR( mp->rreq, n_port ); + CLEAR( mp->sreq, n_port ); + REGISTER_OBJECT( mp, checkpt_mp, restore_mp, NULL ); + return mp; + } + + inline void delete_mp( mp_t* mp ) + { + int port; + if ( !mp ) + return; + UNREGISTER_OBJECT( mp ); + for ( port = 0; port < mp->n_port; port++ ) + { + FREE_ALIGNED( mp->rbuf[port] ); + FREE_ALIGNED( mp->sbuf[port] ); + } + FREE( mp->rreq ); + FREE( mp->sreq ); + FREE( mp->rreq_sz ); + FREE( mp->sreq_sz ); + FREE( mp->rbuf_sz ); + FREE( mp->sbuf_sz ); + FREE( mp->rbuf ); + FREE( mp->sbuf ); + FREE( mp ); + } + + inline void* ALIGNED( 128 ) mp_recv_buffer( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + return mp->rbuf[port]; + } + + inline void* ALIGNED( 128 ) mp_send_buffer( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + return mp->sbuf[port]; + } + + inline void mp_size_recv_buffer( mp_t* mp, int port, int sz ) + { + char* ALIGNED( 128 ) buf; + + if ( !mp || port < 0 || port > mp->n_port || sz < 1 ) + ERROR( ( "Bad args" ) ); + + // If there already a large enough buffer, we are done + if ( mp->rbuf_sz[port] >= sz ) + return; + + // Try to reduce the number of reallocs + sz = (int)( sz * (double)RESIZE_FACTOR ); + + // If no buffer allocated for this port, malloc it and return + if ( !mp->rbuf[port] ) + { + MALLOC_ALIGNED( mp->rbuf[port], sz, 128 ); + mp->rbuf_sz[port] = sz; + return; + } + + // Resize the existing buffer (preserving any data in it) + // (FIXME: THIS IS PROBABLY SILLY!) 
+ MALLOC_ALIGNED( buf, sz, 128 ); + COPY( buf, mp->rbuf[port], mp->rbuf_sz[port] ); + FREE_ALIGNED( mp->rbuf[port] ); + mp->rbuf[port] = buf; + mp->rbuf_sz[port] = sz; + } + + inline void mp_size_send_buffer( mp_t* mp, int port, int sz ) + { + char* ALIGNED( 128 ) buf; + + // Check input arguments + if ( !mp || port < 0 || port > mp->n_port || sz < 1 ) + ERROR( ( "Bad args" ) ); + + // Is there already a large enough buffer + if ( mp->sbuf_sz[port] >= sz ) + return; + + // Try to reduce the number of reallocs + sz = (int)( sz * (double)RESIZE_FACTOR ); + + // If no buffer allocated for this port, malloc it and return + if ( !mp->sbuf[port] ) + { + MALLOC_ALIGNED( mp->sbuf[port], sz, 128 ); + mp->sbuf_sz[port] = sz; + return; + } + + // Resize the existing buffer (preserving any data in it) + // (FIXME: THIS IS PROBABLY SILLY!) + MALLOC_ALIGNED( buf, sz, 128 ); + COPY( buf, mp->sbuf[port], mp->sbuf_sz[port] ); + FREE_ALIGNED( mp->sbuf[port] ); + mp->sbuf[port] = buf; + mp->sbuf_sz[port] = sz; + } + + /* ---- END CUT-AND-PASTE JOB FROM DMPPOLICY ---- */ + + inline void mp_begin_recv( mp_t* mp, int port, int sz, int src, int tag ) + { + if ( !mp || port < 0 || port >= mp->n_port || sz < 1 || + sz > mp->rbuf_sz[port] || src < 0 || src >= world_size ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + mp->rreq_sz[port] = sz; + MPRequest request( P2PTag::irecv, tag, sz, port, src ); + p2p.post( request ); + p2p.irecv( static_cast( mp->rbuf[port] ), sz, tag, port ); + } + + inline void mp_begin_send( mp_t* mp, int port, int sz, int dst, int tag ) + { + if ( !mp || port < 0 || port >= mp->n_port || sz < 1 || + sz > mp->sbuf_sz[port] || dst < 0 || dst >= world_size ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + mp->sreq_sz[port] = sz; + MPRequest request( P2PTag::isend, tag, sz, port, dst ); + p2p.post( request ); + p2p.isend( static_cast( mp->sbuf[port] ), sz, tag, port ); + } + + inline void mp_end_recv( mp_t* 
mp, int port ) + { + int sz; + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::wait_recv, 0, 0, port ); + p2p.post( request ); + p2p.wait_recv( port ); + p2p.get_count( port, sz ); + /* FIXME: SHOULDN'T WE CHECK SZ==RREQ_SZ HERE? */ + } + + inline void mp_end_send( mp_t* mp, int port ) + { + if ( !mp || port < 0 || port >= mp->n_port ) + ERROR( ( "Bad args" ) ); + P2PConnection& p2p = P2PConnection::instance(); + MPRequest request( P2PTag::wait_send, 0, 0, port ); + p2p.post( request ); + p2p.wait_send( port ); + } + +#undef RESIZE_FACTOR }; // struct RelayPolicy diff --git a/src/util/mp/mp.h b/src/util/mp/mp.h index 63d2003a..bc19abd4 100644 --- a/src/util/mp/mp.h +++ b/src/util/mp/mp.h @@ -17,138 +17,91 @@ typedef struct mp mp_t; critical sections and do other tricks liking limiting the number of simultaneous I/O operatorions on large jobs. These macros use blocking send/receives to serialize writes. - + For example, to set up a turnstile that allows at most N simultaneous writes: - + BEGIN_TURNSTILE( N ) { ... do write ... } END_TURNSTILE - + BEGIN_TURNSTILE(1) (i.e., one turnstile) effectively serializes the code. This construct is robust. Turnstiles should not be nested. Code in turnstiles should not attempt to communicate with other processes. - + If everything were perfectly synchronous, then, when using a 10 turnstiles, processes 0:9 would enter the turnstile, followed by 10:19, followed by 20:29, ... 
*/ -#define BEGIN_TURNSTILE(n_turnstile) do { \ - int _n_turnstile = (n_turnstile), _baton; \ - if( world_rank>=_n_turnstile ) \ - mp_recv_i( &_baton, 1, world_rank-_n_turnstile ); \ - do - -#define END_TURNSTILE while(0); \ - if( world_rank+_n_turnstile < world_size ) \ - mp_send_i( &_baton, 1, world_rank+_n_turnstile ); \ - } while(0) +#define BEGIN_TURNSTILE( n_turnstile ) \ + do \ + { \ + int _n_turnstile = ( n_turnstile ), _baton; \ + if ( world_rank >= _n_turnstile ) \ + mp_recv_i( &_baton, 1, world_rank - _n_turnstile ); \ + do + +#define END_TURNSTILE \ + while ( 0 ) \ + ; \ + if ( world_rank + _n_turnstile < world_size ) \ + mp_send_i( &_baton, 1, world_rank + _n_turnstile ); \ + } \ + while ( 0 ) BEGIN_C_DECLS -void -boot_mp( int * pargc, - char *** pargv ); +void boot_mp( int* pargc, char*** pargv ); -void -halt_mp( void ); +void halt_mp( void ); -void -mp_abort( int reason ); +void mp_abort( int reason ); /* Collective commucations */ -void -mp_barrier( void ); +void mp_barrier( void ); -void -mp_allsum_d( double * local, - double * global, - int n ); +void mp_allsum_d( double* local, double* global, int n ); -void -mp_allsum_i( int * local, - int * global, - int n ); +void mp_allsum_i( int* local, int* global, int n ); -void -mp_allgather_i( int * sbuf, - int * rbuf, - int n ); +void mp_allgather_i( int* sbuf, int* rbuf, int n ); -void -mp_allgather_i64( int64_t * sbuf, - int64_t * rbuf, - int n ); +void mp_allgather_i64( int64_t* sbuf, int64_t* rbuf, int n ); // FIXME: THIS API SHOULD TAKE THE ROOT NODE -void -mp_gather_uc( unsigned char * sbuf, - unsigned char * rbuf, - int n ); +void mp_gather_uc( unsigned char* sbuf, unsigned char* rbuf, int n ); /* Turnstile communication primitives */ // FIXME: MESSAGE TAGGING ISSUES? 
-void -mp_send_i( int * buf, - int n, - int dst ); +void mp_send_i( int* buf, int n, int dst ); -void -mp_recv_i( int * buf, - int n, - int src ); +void mp_recv_i( int* buf, int n, int src ); /* Buffered non-blocking point-to-point communications */ -mp_t * -new_mp( int n_port ); +mp_t* new_mp( int n_port ); -void -delete_mp( mp_t * mp ); +void delete_mp( mp_t* mp ); -void * ALIGNED(128) -mp_recv_buffer( mp_t * mp, - int port ); +void* ALIGNED( 128 ) mp_recv_buffer( mp_t* mp, int port ); -void * ALIGNED(128) -mp_send_buffer( mp_t * mp, - int port ); +void* ALIGNED( 128 ) mp_send_buffer( mp_t* mp, int port ); -void -mp_size_recv_buffer( mp_t * mp, - int port, - int size ); +void mp_size_recv_buffer( mp_t* mp, int port, int size ); -void -mp_size_send_buffer( mp_t * mp, - int port, - int size ); +void mp_size_send_buffer( mp_t* mp, int port, int size ); // FIXME: MP REALLY SHOULD HANDLE THE MESSAGE TAGGING -void -mp_begin_recv( mp_t * mp, - int port, - int sz, - int src, - int tag ); - -void -mp_begin_send( mp_t * mp, - int port, - int sz, - int dst, - int tag ); - -void -mp_end_recv( mp_t * mp, - int rbuf ); - -void -mp_end_send( mp_t * mp, - int sbuf ); +void mp_begin_recv( mp_t* mp, int port, int sz, int src, int tag ); + +void mp_begin_send( mp_t* mp, int port, int sz, int dst, int tag ); + +void mp_end_recv( mp_t* mp, int rbuf ); + +void mp_end_send( mp_t* mp, int sbuf ); END_C_DECLS diff --git a/src/util/pipelines/pipelines.h b/src/util/pipelines/pipelines.h index f4259f70..ed874d39 100644 --- a/src/util/pipelines/pipelines.h +++ b/src/util/pipelines/pipelines.h @@ -7,10 +7,13 @@ #include "../util_base.h" -enum { MAX_PIPELINE = 272 }; +enum +{ + MAX_PIPELINE = 272 +}; // Is this even related to pipelines. Maybe this should be in util_base.h. 
-# define PAD_STRUCT( sz ) +#define PAD_STRUCT( sz ) //----------------------------------------------------------------------------// // Make sure that pipelines_pthreads.h and pipelines_openmp.h can only be @@ -23,7 +26,7 @@ enum { MAX_PIPELINE = 272 }; // If using Pthreads, include pipelines_pthreads.h. //----------------------------------------------------------------------------// -#if defined(VPIC_USE_PTHREADS) +#if defined( VPIC_USE_PTHREADS ) #include "pipelines_pthreads.h" @@ -31,7 +34,7 @@ enum { MAX_PIPELINE = 272 }; // If using OpenMP, include pipelines_openmp.h. //----------------------------------------------------------------------------// -#elif defined(VPIC_USE_OPENMP) +#elif defined( VPIC_USE_OPENMP ) #include "pipelines_openmp.h" @@ -54,4 +57,4 @@ enum { MAX_PIPELINE = 272 }; #undef THREAD_REROUTE -#endif // _pipelines_h_ +#endif // _pipelines_h_ diff --git a/src/util/pipelines/pipelines_exec.h b/src/util/pipelines/pipelines_exec.h index 58736032..f83c8702 100644 --- a/src/util/pipelines/pipelines_exec.h +++ b/src/util/pipelines/pipelines_exec.h @@ -7,9 +7,9 @@ #include "pipelines.h" +#include "../v16/v16.h" #include "../v4/v4.h" #include "../v8/v8.h" -#include "../v16/v16.h" //----------------------------------------------------------------------------// // Make sure that pipelines_exec_pth.h and pipelines_exec_omp.h can only be @@ -22,7 +22,7 @@ // If using Pthreads, include pipelines_exec_pth.h. //----------------------------------------------------------------------------// -#if defined(VPIC_USE_PTHREADS) +#if defined( VPIC_USE_PTHREADS ) #include "pipelines_exec_pth.h" @@ -30,7 +30,7 @@ // If using OpenMP, include pipelines_exec_omp.h. 
//----------------------------------------------------------------------------// -#elif defined(VPIC_USE_OPENMP) +#elif defined( VPIC_USE_OPENMP ) #include "pipelines_exec_omp.h" @@ -52,4 +52,4 @@ #undef THREAD_REROUTE -#endif // _pipelines_exec_h_ +#endif // _pipelines_exec_h_ diff --git a/src/util/pipelines/pipelines_exec_omp.h b/src/util/pipelines/pipelines_exec_omp.h index bbd24a87..88e5cd2f 100644 --- a/src/util/pipelines/pipelines_exec_omp.h +++ b/src/util/pipelines/pipelines_exec_omp.h @@ -12,7 +12,7 @@ // TODO: this could be removed as VPIC (elsewhere) knows how to stringify from a // macro -#define TOSTRING( a ) #a //convert pragma directives to string +#define TOSTRING( a ) #a // convert pragma directives to string #define WAIT_PIPELINES() _Pragma( TOSTRING( omp barrier ) ) @@ -22,18 +22,20 @@ // the scalar pipeline. //----------------------------------------------------------------------------// -#if defined(V16_ACCELERATION) && defined(HAS_V16_PIPELINE) +#if defined( V16_ACCELERATION ) && defined( HAS_V16_PIPELINE ) -# define EXEC_PIPELINES(name, args, str) \ - _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ - { \ +#define EXEC_PIPELINES( name, args, str ) \ + _Pragma( \ + TOSTRING( omp parallel num_threads( N_PIPELINE ) shared( args ) ) ) \ + { \ _Pragma( TOSTRING( omp for ) ) \ - for( int id = 0; id < N_PIPELINE; id++ ) \ - { \ - name##_pipeline_v16( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ - } \ - } \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v16( args + id * sizeof( *args ) * str, id, \ + N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ); //----------------------------------------------------------------------------// // Macro defines to support v8 simd vector acceleration. Uses thread @@ -41,18 +43,20 @@ // the scalar pipeline. 
//----------------------------------------------------------------------------// -#elif defined(V8_ACCELERATION) && defined(HAS_V8_PIPELINE) +#elif defined( V8_ACCELERATION ) && defined( HAS_V8_PIPELINE ) -# define EXEC_PIPELINES(name, args, str) \ - _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ - { \ +#define EXEC_PIPELINES( name, args, str ) \ + _Pragma( \ + TOSTRING( omp parallel num_threads( N_PIPELINE ) shared( args ) ) ) \ + { \ _Pragma( TOSTRING( omp for ) ) \ - for( int id = 0; id < N_PIPELINE; id++ ) \ - { \ - name##_pipeline_v8( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ - } \ - } \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v8( args + id * sizeof( *args ) * str, id, \ + N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ); //----------------------------------------------------------------------------// // Macro defines to support v4 simd vector acceleration. Uses thread @@ -60,18 +64,20 @@ // the scalar pipeline. 
//----------------------------------------------------------------------------// -#elif defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#elif defined( V4_ACCELERATION ) && defined( HAS_V4_PIPELINE ) -# define EXEC_PIPELINES(name, args, str) \ - _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ - { \ +#define EXEC_PIPELINES( name, args, str ) \ + _Pragma( \ + TOSTRING( omp parallel num_threads( N_PIPELINE ) shared( args ) ) ) \ + { \ _Pragma( TOSTRING( omp for ) ) \ - for( int id = 0; id < N_PIPELINE; id++ ) \ - { \ - name##_pipeline_v4( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ - } \ - } \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v4( args + id * sizeof( *args ) * str, id, \ + N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ); //----------------------------------------------------------------------------// // Macro defines to support the standard implementation which does not use @@ -81,17 +87,19 @@ #else -# define EXEC_PIPELINES(name, args, str) \ - _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ - { \ +#define EXEC_PIPELINES( name, args, str ) \ + _Pragma( \ + TOSTRING( omp parallel num_threads( N_PIPELINE ) shared( args ) ) ) \ + { \ _Pragma( TOSTRING( omp for ) ) \ - for( int id = 0; id < N_PIPELINE; id++ ) \ - { \ - name##_pipeline_scalar( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ - } \ - } \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_scalar( args + id * sizeof( *args ) * str, id, \ + N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ); #endif -#endif // _pipelines_exec_omp_h_ +#endif // _pipelines_exec_omp_h_ diff --git a/src/util/pipelines/pipelines_exec_pth.h 
b/src/util/pipelines/pipelines_exec_pth.h index ecfa0910..576d5463 100644 --- a/src/util/pipelines/pipelines_exec_pth.h +++ b/src/util/pipelines/pipelines_exec_pth.h @@ -10,7 +10,7 @@ // as the standard case that does not use vector acceleration. //----------------------------------------------------------------------------// -# define WAIT_PIPELINES() thread.wait() +#define WAIT_PIPELINES() thread.wait() //----------------------------------------------------------------------------// // Macro defines to support v16 simd vector acceleration. Uses thread @@ -18,12 +18,12 @@ // the scalar pipeline. //----------------------------------------------------------------------------// -#if defined(V16_ACCELERATION) && defined(HAS_V16_PIPELINE) +#if defined( V16_ACCELERATION ) && defined( HAS_V16_PIPELINE ) -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline_v16, \ - args, sizeof(*args), str ); \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) +#define EXEC_PIPELINES( name, args, str ) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v16, args, \ + sizeof( *args ), str ); \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ) //----------------------------------------------------------------------------// // Macro defines to support v8 simd vector acceleration. Uses thread @@ -31,12 +31,12 @@ // the scalar pipeline. 
//----------------------------------------------------------------------------// -#elif defined(V8_ACCELERATION) && defined(HAS_V8_PIPELINE) +#elif defined( V8_ACCELERATION ) && defined( HAS_V8_PIPELINE ) -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline_v8, \ - args, sizeof(*args), str ); \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) +#define EXEC_PIPELINES( name, args, str ) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v8, args, \ + sizeof( *args ), str ); \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ) //----------------------------------------------------------------------------// // Macro defines to support v4 simd vector acceleration. Uses thread @@ -44,12 +44,12 @@ // the scalar pipeline. //----------------------------------------------------------------------------// -#elif defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#elif defined( V4_ACCELERATION ) && defined( HAS_V4_PIPELINE ) -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline_v4, \ - args, sizeof(*args), str ); \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) +#define EXEC_PIPELINES( name, args, str ) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v4, args, \ + sizeof( *args ), str ); \ + name##_pipeline_scalar( args + str * N_PIPELINE, N_PIPELINE, N_PIPELINE ) //----------------------------------------------------------------------------// // Macro defines to support the standard implementation which does not use @@ -59,11 +59,11 @@ #else -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline_scalar, \ - args, sizeof(*args), str ); \ - name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) +#define EXEC_PIPELINES( name, args, str ) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_scalar, args, \ + sizeof( *args ), str ); \ + name##_pipeline_scalar( args 
+ str * N_PIPELINE, N_PIPELINE, N_PIPELINE ) #endif -#endif // _pipelines_exec_pth_h_ +#endif // _pipelines_exec_pth_h_ diff --git a/src/util/pipelines/pipelines_openmp.h b/src/util/pipelines/pipelines_openmp.h index d76429d2..2e6f187b 100644 --- a/src/util/pipelines/pipelines_openmp.h +++ b/src/util/pipelines/pipelines_openmp.h @@ -23,17 +23,15 @@ typedef struct omp_container { - int n_pipeline; - int dispatch_to_host; + int n_pipeline; + int dispatch_to_host; - //const char * f_dump; - //const char * e_dump; + // const char * f_dump; + // const char * e_dump; - //boot gets the number of pipelines from the cmd line - //and passes it on to the EXEC_PIPELINS macro eventually - void - (*boot)( int * pargc, - char *** pargv ); + // boot gets the number of pipelines from the cmd line + // and passes it on to the EXEC_PIPELINS macro eventually + void ( *boot )( int* pargc, char*** pargv ); } omp_container_t; BEGIN_C_DECLS @@ -42,4 +40,4 @@ extern omp_container_t omp_helper; END_C_DECLS -#endif // _pipelines_openmp_h_ +#endif // _pipelines_openmp_h_ diff --git a/src/util/pipelines/pipelines_pthreads.h b/src/util/pipelines/pipelines_pthreads.h index 9d90936e..2e51d33f 100644 --- a/src/util/pipelines/pipelines_pthreads.h +++ b/src/util/pipelines/pipelines_pthreads.h @@ -11,68 +11,59 @@ // pipelines dispatched. //----------------------------------------------------------------------------// -typedef void -(*pipeline_func_t)( void * args, - int pipeline_rank, - int n_pipeline ); +typedef void ( *pipeline_func_t )( void* args, int pipeline_rank, + int n_pipeline ); //----------------------------------------------------------------------------// // Generic macros that are used for all cases of vector acceleration as well // as the standard case that does not use vector acceleration. 
//----------------------------------------------------------------------------// -# define N_PIPELINE thread.n_pipeline +#define N_PIPELINE thread.n_pipeline //////////////////////////////////////////////////////////////////////////////// typedef struct pipeline_dispatcher { - // n_pipelines indicates the number of pipelines currently running. - // Technically, this should be read only for users! - - int n_pipeline; - - // boot creates the number of pipelines requested (in the command - // line args). Generally, this is number of cores on a node if - // using symmetric multiprocessing or the number of pipeline - // processors if using heterogeneous multiprocessing. - - void - (*boot)( int * pargc, - char *** pargv ); - - // halt destroys all the resources used by the dispatcher created - // in boot. - - void (*halt)( void ); - - // dispatch begins executing the given pipeline function on all the - // pipelines. - // - // pipeline is the pipeline function to execute on the pipelines. - // - // args is an array of arguments to pass to each pipeline. - // - // sz gives the byte size of an element of the argument - // array. - // - // str gives the element stride between elements of the argument - // array. Pass 0 if you want all pipelines to get the same - // arguments. - // - // If the pipeline functions do not take arguments, use NULL for - // args and 0 for sz and str - - void - (*dispatch)( pipeline_func_t pipeline, - void * args, - int sz, - int str ); - - // wait waits for the previous dispatch to complete. - - void - (*wait)( void ); + // n_pipelines indicates the number of pipelines currently running. + // Technically, this should be read only for users! + + int n_pipeline; + + // boot creates the number of pipelines requested (in the command + // line args). Generally, this is number of cores on a node if + // using symmetric multiprocessing or the number of pipeline + // processors if using heterogeneous multiprocessing. 
+ + void ( *boot )( int* pargc, char*** pargv ); + + // halt destroys all the resources used by the dispatcher created + // in boot. + + void ( *halt )( void ); + + // dispatch begins executing the given pipeline function on all the + // pipelines. + // + // pipeline is the pipeline function to execute on the pipelines. + // + // args is an array of arguments to pass to each pipeline. + // + // sz gives the byte size of an element of the argument + // array. + // + // str gives the element stride between elements of the argument + // array. Pass 0 if you want all pipelines to get the same + // arguments. + // + // If the pipeline functions do not take arguments, use NULL for + // args and 0 for sz and str + + void ( *dispatch )( pipeline_func_t pipeline, void* args, int sz, int str ); + + // wait waits for the previous dispatch to complete. + + void ( *wait )( void ); } pipeline_dispatcher_t; @@ -83,4 +74,4 @@ extern pipeline_dispatcher_t thread; END_C_DECLS -#endif // _pipelines_pthreads_h_ +#endif // _pipelines_pthreads_h_ diff --git a/src/util/profile/profile.h b/src/util/profile/profile.h index f26de1e3..82d6b256 100644 --- a/src/util/profile/profile.h +++ b/src/util/profile/profile.h @@ -8,40 +8,40 @@ // keep dumps prettily formatted, only the first 16 characters of the // timer name is printed on profile dumps. 
-#define PROFILE_TIMERS(_) \ - _( clear_accumulators ) \ - _( sort_p ) \ - _( collision_model ) \ - _( advance_p ) \ - _( reduce_accumulators ) \ - _( emission_model ) \ - _( boundary_p ) \ - _( clear_jf ) \ - _( unload_accumulator ) \ - _( synchronize_jf ) \ - _( advance_b ) \ - _( advance_e ) \ - _( clear_rhof ) \ - _( accumulate_rho_p ) \ - _( synchronize_rho ) \ - _( compute_div_e_err ) \ - _( compute_rms_div_e_err ) \ - _( clean_div_e ) \ - _( compute_div_b_err ) \ - _( compute_rms_div_b_err ) \ - _( clean_div_b ) \ - _( synchronize_tang_e_norm_b ) \ - _( load_interpolator ) \ - _( compute_curl_b ) \ - _( compute_rhob ) \ - _( center_p ) \ - _( uncenter_p ) \ - _( user_initialization ) \ - _( user_particle_collisions ) \ - _( user_particle_injection ) \ - _( user_current_injection ) \ - _( user_field_injection ) \ - _( user_diagnostics ) +#define PROFILE_TIMERS( _ ) \ + _( clear_accumulators ) \ + _( sort_p ) \ + _( collision_model ) \ + _( advance_p ) \ + _( reduce_accumulators ) \ + _( emission_model ) \ + _( boundary_p ) \ + _( clear_jf ) \ + _( unload_accumulator ) \ + _( synchronize_jf ) \ + _( advance_b ) \ + _( advance_e ) \ + _( clear_rhof ) \ + _( accumulate_rho_p ) \ + _( synchronize_rho ) \ + _( compute_div_e_err ) \ + _( compute_rms_div_e_err ) \ + _( clean_div_e ) \ + _( compute_div_b_err ) \ + _( compute_rms_div_b_err ) \ + _( clean_div_b ) \ + _( synchronize_tang_e_norm_b ) \ + _( load_interpolator ) \ + _( compute_curl_b ) \ + _( compute_rhob ) \ + _( center_p ) \ + _( uncenter_p ) \ + _( user_initialization ) \ + _( user_particle_collisions ) \ + _( user_particle_injection ) \ + _( user_current_injection ) \ + _( user_field_injection ) \ + _( user_diagnostics ) // TIC / TOC are used to update the timing profile. For example: // @@ -50,33 +50,38 @@ // A TIC/TOC block is semantically a single statement (so it works // fine as the body of a for loop or an if statement. 
-#define TIC \ - do { \ - double _profile_tic = wallclock(); \ - do - -#define TOC(timer,n_calls) \ - while(0); \ - profile_internal_use_only[profile_internal_use_only_##timer].t += \ - wallclock() - _profile_tic; \ - profile_internal_use_only[profile_internal_use_only_##timer].n += \ - (n_calls); \ - } while(0) +#define TIC \ + do \ + { \ + double _profile_tic = wallclock(); \ + do + +#define TOC( timer, n_calls ) \ + while ( 0 ) \ + ; \ + profile_internal_use_only[profile_internal_use_only_##timer].t += \ + wallclock() - _profile_tic; \ + profile_internal_use_only[profile_internal_use_only_##timer].n += \ + ( n_calls ); \ + } \ + while ( 0 ) // Do not touch these -enum profile_internal_use_only_timers { - profile_internal_use_only_invalid_timer = -1, -# define PROFILE_INTERNAL_USE_ONLY( timer ) profile_internal_use_only_##timer, - PROFILE_TIMERS( PROFILE_INTERNAL_USE_ONLY ) -# undef PROFILE_INTERNAL_USE_ONLY - profile_internal_use_only_n_timer +enum profile_internal_use_only_timers +{ + profile_internal_use_only_invalid_timer = -1, +#define PROFILE_INTERNAL_USE_ONLY( timer ) profile_internal_use_only_##timer, + PROFILE_TIMERS( PROFILE_INTERNAL_USE_ONLY ) +#undef PROFILE_INTERNAL_USE_ONLY + profile_internal_use_only_n_timer }; -typedef struct profile_internal_use_only_timer { - const char * name; - double t, t_total; - int n, n_total; +typedef struct profile_internal_use_only_timer +{ + const char* name; + double t, t_total; + int n, n_total; } profile_internal_use_only_timer_t; extern profile_internal_use_only_timer_t profile_internal_use_only[]; @@ -86,14 +91,12 @@ BEGIN_C_DECLS // Updates the cumulative profile, resets the local profile and, if // dump is true, writes the local and cumulative profiles to the log. -void -update_profile( int dump ); +void update_profile( int dump ); // Returns a local wallclock in seconds. Only relative values are // accurate, and then only within same "short run". 
-double -wallclock( void ); +double wallclock( void ); END_C_DECLS diff --git a/src/util/rng/drandn_table.h b/src/util/rng/drandn_table.h index f007ab4c..097684a9 100644 --- a/src/util/rng/drandn_table.h +++ b/src/util/rng/drandn_table.h @@ -5,8 +5,8 @@ #error "Do not include drand_table.h; use rng.h" #endif -#define DRANDN_N 256 -#define DRANDN_R (3.6554204190269413594915892673498092335649e+00) +#define DRANDN_N 256 +#define DRANDN_R ( 3.6554204190269413594915892673498092335649e+00 ) extern const double drandn_zig_x[]; extern const double drandn_zig_y[]; diff --git a/src/util/rng/frandn_table.h b/src/util/rng/frandn_table.h index 9428698d..bb763649 100644 --- a/src/util/rng/frandn_table.h +++ b/src/util/rng/frandn_table.h @@ -6,7 +6,7 @@ #endif #define FRANDN_N 64 -#define FRANDN_R (3.2159292455085228233657712593185351579450e+00f) +#define FRANDN_R ( 3.2159292455085228233657712593185351579450e+00f ) extern const float frandn_zig_x[]; extern const float frandn_zig_y[]; diff --git a/src/util/rng/rng.h b/src/util/rng/rng.h index cea747e2..6ad337c2 100644 --- a/src/util/rng/rng.h +++ b/src/util/rng/rng.h @@ -10,25 +10,25 @@ typedef struct rng rng_t; /* A rng_pool is a collection of random number generators. */ -typedef struct rng_pool { - rng_t ** rng; /* Random number generators (indexed 0:n_rng-1) */ - int n_rng; /* Number of random number generators in pool */ +typedef struct rng_pool +{ + rng_t** rng; /* Random number generators (indexed 0:n_rng-1) */ + int n_rng; /* Number of random number generators in pool */ } rng_pool_t; BEGIN_C_DECLS /* In rng_pool.c. 
*/ -rng_pool_t * /* New pool (already seeded via seed_rng_pool) */ +rng_pool_t* /* New pool (already seeded via seed_rng_pool) */ new_rng_pool( int n_rng, /* Number of generators for pool */ int seed, /* Pool seed (different meaning from seed_rng) */ int sync ); /* True for synchronized seeding */ -void -delete_rng_pool( rng_pool_t * RESTRICT rp ); /* Pool to delete */ +void delete_rng_pool( rng_pool_t* RESTRICT rp ); /* Pool to delete */ /* In seed_rng_pool, seeding is done such that: - local_pool = seed_rng_pool( rp, seed, 0 ); + local_pool = seed_rng_pool( rp, seed, 0 ); sync_pool = seed_rng_pool( rp, seed, 1 ); gives each local_pool rng and each sync_pool rng has a unique seed on all calling processes and that the sync pool rngs are @@ -37,25 +37,24 @@ delete_rng_pool( rng_pool_t * RESTRICT rp ); /* Pool to delete */ /* FIXME: WE NEED BIGGER SEEDS. NOTE THAT THE EFFECT SEED SPACE FOR POOLS IS ROUGHLY FLOOR( UINT_MAX / (n_rng*(world_size+1)) ) */ -rng_pool_t * /* Returns rp */ -seed_rng_pool( rng_pool_t * RESTRICT rp, /* Pool to seed */ - int seed, /* Pool seed (different meaning from - seed_rng) */ - int sync ); /* True for synchronized seeding */ +rng_pool_t* /* Returns rp */ +seed_rng_pool( rng_pool_t* RESTRICT rp, /* Pool to seed */ + int seed, /* Pool seed (different meaning from + seed_rng) */ + int sync ); /* True for synchronized seeding */ /* In rng.c */ -rng_t * /* New generator (already seeded via seed_rng) */ +rng_t* /* New generator (already seeded via seed_rng) */ new_rng( int seed ); /* Random number generator seed */ -void -delete_rng( rng_t * RESTRICT r ); /* Generator to delete */ +void delete_rng( rng_t* RESTRICT r ); /* Generator to delete */ /* FIXME: WE NEED TO BIGGER SEEDS. SEE ABOVE. 
*/ -rng_t * /* Returns r */ -seed_rng( rng_t * RESTRICT r, /* Generator to seed */ - int seed ); /* Seed */ +rng_t* /* Returns r */ +seed_rng( rng_t* RESTRICT r, /* Generator to seed */ + int seed ); /* Seed */ /* Integer random generators make uniform rands on [0,INTTYPE_MAX] for signed types and on [0,UINTTYPE_MAX] for unsigned types. There are @@ -68,7 +67,7 @@ seed_rng( rng_t * RESTRICT r, /* Generator to seed */ [c,h,i,l,i8,i16,i32,i64,uc,uh,ui,ul,u8,u16,u32,u64]rand[,_fill] where the generated data type is: - c => char, uc => unsigned char, + c => char, uc => unsigned char, h => short, uh => unsigned short, i => int, ui => unsigned int, l => long, ul => unsigned long @@ -78,25 +77,23 @@ seed_rng( rng_t * RESTRICT r, /* Generator to seed */ i64 => int64_t, u64 => uint64_t and _fill indicates a mass production variant */ -#define _( prefix, type ) \ -type /* Returns sample deviate */ \ -prefix##rand( rng_t * RESTRICT r ); /* Generator to use */ \ - \ -type * /* Returns x */ \ -prefix##rand_fill( rng_t * RESTRICT r, /* Generator to use */ \ - type * RESTRICT x, /* Array to fill */ \ - size_t str_ele, /* Element stride */ \ - size_t n_ele ); /* Number of elements */ - -_( c, char ) _( uc, unsigned char ) -_( h, short ) _( uh, unsigned short ) -_( i, int ) _( ui, unsigned int ) -_( l, long ) _( ul, unsigned long ) - -_( i8, int8_t ) _( u8, uint8_t ) -_( i16, int16_t ) _( u16, uint16_t ) -_( i32, int32_t ) _( u32, uint32_t ) -_( i64, int64_t ) _( u64, uint64_t ) +#define _( prefix, type ) \ + type /* Returns sample deviate */ \ + prefix##rand( rng_t* RESTRICT r ); /* Generator to use */ \ + \ + type* /* Returns x */ \ + prefix##rand_fill( rng_t* RESTRICT r, /* Generator to use */ \ + type* RESTRICT x, /* Array to fill */ \ + size_t str_ele, /* Element stride */ \ + size_t n_ele ); /* Number of elements */ + +_( c, char ) +_( uc, unsigned char ) _( h, short ) _( uh, unsigned short ) _( i, int ) + _( ui, unsigned int ) _( l, long ) _( ul, unsigned long ) + + _( i8, 
int8_t ) _( u8, uint8_t ) _( i16, int16_t ) _( u16, uint16_t ) + _( i32, int32_t ) _( u32, uint32_t ) _( i64, int64_t ) + _( u64, uint64_t ) #undef _ @@ -120,7 +117,7 @@ _( i64, int64_t ) _( u64, uint64_t ) - In the closed variant, 0 or 1 can both be returned. - In the half open at 1 variant, 1 can never be returned. - In the half open at 0 variant, 0 can never be returned. - + There are single generators for each primitive floating point type and domain. Each singleton generator has a corresponding mass production generator. The singleton generators not error trapped @@ -138,83 +135,84 @@ _( i64, int64_t ) _( u64, uint64_t ) _c => [0,1] and _fill indicates a mass production variant */ -#define _( type, prefix, variant ) \ -type /* Returns sample deviate */ \ -prefix##rand##variant( rng_t * RESTRICT r ); /* Generator to use */ \ - \ -type * /* Returns x */ \ -prefix##rand##variant##_fill( rng_t * RESTRICT r, /* Generator to use */ \ - type * RESTRICT x, /* Array to fill */ \ - size_t str_ele, /* Element stride */ \ - size_t n_ele ); /* Number of elements */ - -_( float, f, ) _( double, d, ) -_( float, f, _c0 ) _( double, d, _c0 ) -_( float, f, _c1 ) _( double, d, _c1 ) -_( float, f, _c ) _( double, d, _c ) +#define _( type, prefix, variant ) \ + type /* Returns sample deviate */ \ + prefix##rand##variant( rng_t* RESTRICT r ); /* Generator to use */ \ + \ + type* /* Returns x */ \ + prefix##rand##variant##_fill( \ + rng_t* RESTRICT r, /* Generator to use */ \ + type* RESTRICT x, /* Array to fill */ \ + size_t str_ele, /* Element stride */ \ + size_t n_ele ); /* Number of elements */ + + _( float, f, ) _( double, d, ) _( float, f, _c0 ) + _( double, d, _c0 ) _( float, f, _c1 ) + _( double, d, _c1 ) _( float, f, _c ) + _( double, d, _c ) #undef _ -/* The normal generators generate a normally distributed random number - (f(x) = exp( -x^2 / 2 ) / sqrt( 2*pi ) for x in (-inf,inf)). Based - on the Ziggurat method under the hood. 
*/ + /* The normal generators generate a normally distributed random number + (f(x) = exp( -x^2 / 2 ) / sqrt( 2*pi ) for x in (-inf,inf)). Based + on the Ziggurat method under the hood. */ -float /* Returns sample deviate */ -frandn( rng_t * RESTRICT r ); /* Generator to use */ + float /* Returns sample deviate */ + frandn( rng_t* RESTRICT r ); /* Generator to use */ -float * /* Returns x */ -frandn_fill( rng_t * RESTRICT r, /* Generator to use */ - float * RESTRICT x, /* Array to fill */ - size_t str_ele, /* Element stride */ - size_t n_ele ); /* Number of elements */ +float* /* Returns x */ +frandn_fill( rng_t* RESTRICT r, /* Generator to use */ + float* RESTRICT x, /* Array to fill */ + size_t str_ele, /* Element stride */ + size_t n_ele ); /* Number of elements */ -double /* Returns sample deviate */ -drandn( rng_t * RESTRICT r ); /* Generator to use */ +double /* Returns sample deviate */ +drandn( rng_t* RESTRICT r ); /* Generator to use */ -double * /* Returns x */ -drandn_fill( rng_t * RESTRICT r, /* Generator to use */ - double * RESTRICT x, /* Array to fill */ - size_t str_ele, /* Element stride */ - size_t n_ele ); /* Number of elements */ +double* /* Returns x */ +drandn_fill( rng_t* RESTRICT r, /* Generator to use */ + double* RESTRICT x, /* Array to fill */ + size_t str_ele, /* Element stride */ + size_t n_ele ); /* Number of elements */ /* The exponential generators generate an exponentially distributed random number (f(x) = exp(-x) for x in [0,inf). Based on the transformation method under the hood. 
*/ - -float /* Returns sample deviate */ -frande( rng_t * RESTRICT r ); /* Generator to use */ -float * /* Returns x */ -frande_fill( rng_t * RESTRICT r, /* Generator to use */ - float * RESTRICT x, /* Array to fill */ - size_t str_ele, /* Element stride */ - size_t n_ele ); /* Number of elements */ +float /* Returns sample deviate */ +frande( rng_t* RESTRICT r ); /* Generator to use */ + +float* /* Returns x */ +frande_fill( rng_t* RESTRICT r, /* Generator to use */ + float* RESTRICT x, /* Array to fill */ + size_t str_ele, /* Element stride */ + size_t n_ele ); /* Number of elements */ -double /* Returns sample deviate */ -drande( rng_t * RESTRICT r ); /* Generator to use */ +double /* Returns sample deviate */ +drande( rng_t* RESTRICT r ); /* Generator to use */ -double * /* Returns x */ -drande_fill( rng_t * RESTRICT r, /* Generator to use */ - double * RESTRICT x, /* Array to fill */ - size_t str_ele, /* Element stride */ - size_t n_ele ); /* Number of elements */ +double* /* Returns x */ +drande_fill( rng_t* RESTRICT r, /* Generator to use */ + double* RESTRICT x, /* Array to fill */ + size_t str_ele, /* Element stride */ + size_t n_ele ); /* Number of elements */ /* Specialty generators */ -int * /* Returns x */ -randperm( rng_t * RESTRICT r, /* Generator to use */ - int * RESTRICT x, /* 0:n-1 indexed, holds a random - permutation of 0:n-1 on output */ - int n ); /* Permutation size */ +int* /* Returns x */ +randperm( rng_t* RESTRICT r, /* Generator to use */ + int* RESTRICT x, /* 0:n-1 indexed, holds a random + permutation of 0:n-1 on output */ + int n ); /* Permutation size */ /* This function is most efficient when str_ele is a integer multiple of sz_ele and sz_ele is either 0, 1, 2, 4 or 8. 
*/ -void * /* Returns x */ -shuffle( rng_t * RESTRICT r, /* Generator to use */ - void * RESTRICT x, /* Elements to shuffle */ - size_t sz_ele, /* Element _byte_ size */ - size_t str_ele, /* Element _byte_ stride */ - size_t n_ele ); /* Number of elements */ +void* /* Returns x */ +shuffle( rng_t* RESTRICT r, /* Generator to use */ + void* RESTRICT x, /* Elements to shuffle */ + size_t sz_ele, /* Element _byte_ size */ + size_t str_ele, /* Element _byte_ stride */ + size_t n_ele ); /* Number of elements */ END_C_DECLS diff --git a/src/util/rng/rng_private.h b/src/util/rng/rng_private.h index 4b9306bb..98072e52 100644 --- a/src/util/rng/rng_private.h +++ b/src/util/rng/rng_private.h @@ -7,18 +7,20 @@ #include "rng.h" -#if defined(__SSE2__) /* Use SSE-2 accelerated version */ +#if defined( __SSE2__ ) /* Use SSE-2 accelerated version */ #include -typedef struct sfmt_128 { - __m128i u; +typedef struct sfmt_128 +{ + __m128i u; } sfmt_128_t; #else /* Use portable version */ -typedef struct sfmt_128 { - uint32_t u0, u1, u2, u3; +typedef struct sfmt_128 +{ + uint32_t u0, u1, u2, u3; } sfmt_128_t; #endif @@ -32,240 +34,291 @@ typedef struct sfmt_128 { #define SFMT_E 11213 #endif -enum sfmt_parameters { - - /* Parameter sets are defined by: - - SFMT_M = ... < floor(SFMT_E/128) ..., - SFMT_L1 = ... < 32 ..., - SFMT_L2 = ... < 4 ..., - SFMT_R1 = ... < 32 ..., - SFMT_R2 = ... < 3 ... - # define SFMT_MASK0 ((uint32_t)...) - # define SFMT_MASK1 ((uint32_t)...) - # define SFMT_MASK2 ((uint32_t)...) - # define SFMT_MASK3 ((uint32_t)...) - # define SFMT_PARITY0 ((uint32_t)...) - # define SFMT_PARITY1 ((uint32_t)...) - # define SFMT_PARITY2 ((uint32_t)...) - # define SFMT_PARITY3 ((uint32_t)...) 
- - The masks and parities technically can't be part of the enum - because enums hate potentially large unsigned quantities */ - -# if SFMT_E==607 /* Verified */ - - SFMT_M = 2, SFMT_L1 = 15, SFMT_L2 = 3, SFMT_R1 = 13, SFMT_R2 = 3, -# define SFMT_MASK0 ((uint32_t)0xfdff37ff) -# define SFMT_MASK1 ((uint32_t)0xef7f3f7d) -# define SFMT_MASK2 ((uint32_t)0xff777b7d) -# define SFMT_MASK3 ((uint32_t)0x7ff7fb2f) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0x00000000) -# define SFMT_PARITY3 ((uint32_t)0x5986f054) - -# elif SFMT_E==1279 /* Verified */ - - SFMT_M = 7, SFMT_L1 = 14, SFMT_L2 = 3, SFMT_R1 = 5, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0xf7fefffd) -# define SFMT_MASK1 ((uint32_t)0x7fefcfff) -# define SFMT_MASK2 ((uint32_t)0xaff3ef3f) -# define SFMT_MASK3 ((uint32_t)0xb5ffff7f) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0x00000000) -# define SFMT_PARITY3 ((uint32_t)0x20000000) - -# elif SFMT_E==2281 /* Verified */ - - SFMT_M = 12, SFMT_L1 = 19, SFMT_L2 = 1, SFMT_R1 = 5, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0xbff7ffbf) -# define SFMT_MASK1 ((uint32_t)0xfdfffffe) -# define SFMT_MASK2 ((uint32_t)0xf7ffef7f) -# define SFMT_MASK3 ((uint32_t)0xf2f7cbbf) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0x00000000) -# define SFMT_PARITY3 ((uint32_t)0x41dfa600) - -# elif SFMT_E==4253 /* Verified */ - - SFMT_M = 17, SFMT_L1 = 20, SFMT_L2 = 1, SFMT_R1 = 7, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0x9f7bffff) -# define SFMT_MASK1 ((uint32_t)0x9fffff5f) -# define SFMT_MASK2 ((uint32_t)0x3efffffb) -# define SFMT_MASK3 ((uint32_t)0xfffff7bb) -# define SFMT_PARITY0 ((uint32_t)0xa8000001) -# define SFMT_PARITY1 ((uint32_t)0xaf5390a3) -# define SFMT_PARITY2 ((uint32_t)0xb740b3f8) -# define SFMT_PARITY3 ((uint32_t)0x6c11486d) - -# 
elif SFMT_E==11213 /* Verified */ - - SFMT_M = 68, SFMT_L1 = 14, SFMT_L2 = 3, SFMT_R1 = 7, SFMT_R2 = 3, -# define SFMT_MASK0 ((uint32_t)0xeffff7fb) -# define SFMT_MASK1 ((uint32_t)0xffffffef) -# define SFMT_MASK2 ((uint32_t)0xdfdfbfff) -# define SFMT_MASK3 ((uint32_t)0x7fffdbfd) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0xe8148000) -# define SFMT_PARITY3 ((uint32_t)0xd0c7afa3) - -# elif SFMT_E==19937 /* Verified */ - - SFMT_M = 122, SFMT_L1 = 18, SFMT_L2 = 1, SFMT_R1 = 11, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0xdfffffef) -# define SFMT_MASK1 ((uint32_t)0xddfecb7f) -# define SFMT_MASK2 ((uint32_t)0xbffaffff) -# define SFMT_MASK3 ((uint32_t)0xbffffff6) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0x00000000) -# define SFMT_PARITY3 ((uint32_t)0x13c9e684) - -# elif SFMT_E==44497 /* Verified */ - - SFMT_M = 330, SFMT_L1 = 5, SFMT_L2 = 3, SFMT_R1 = 9, SFMT_R2 = 3, -# define SFMT_MASK0 ((uint32_t)0xeffffffb) -# define SFMT_MASK1 ((uint32_t)0xdfbebfff) -# define SFMT_MASK2 ((uint32_t)0xbfbf7bef) -# define SFMT_MASK3 ((uint32_t)0x9ffd7bff) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0xa3ac4000) -# define SFMT_PARITY3 ((uint32_t)0xecc1327a) - - /* Note: SFMT_E==86243 not supported because SFMT_L2 is too large - for this implementation */ - -# elif SFMT_E==132049 /* Verified */ - - SFMT_M = 110, SFMT_L1 = 19, SFMT_L2 = 1, SFMT_R1 = 21, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0xffffbb5f) -# define SFMT_MASK1 ((uint32_t)0xfb6ebf95) -# define SFMT_MASK2 ((uint32_t)0xfffefffa) -# define SFMT_MASK3 ((uint32_t)0xcff77fff) -# define SFMT_PARITY0 ((uint32_t)0x00000001) -# define SFMT_PARITY1 ((uint32_t)0x00000000) -# define SFMT_PARITY2 ((uint32_t)0xcb520000) -# define SFMT_PARITY3 ((uint32_t)0xc7e91c7d) - -# elif SFMT_E==216091 
/* Verified */ - - SFMT_M = 627, SFMT_L1 = 11, SFMT_L2 = 3, SFMT_R1 = 10, SFMT_R2 = 1, -# define SFMT_MASK0 ((uint32_t)0xbff7bff7) -# define SFMT_MASK1 ((uint32_t)0xbfffffff) -# define SFMT_MASK2 ((uint32_t)0xbffffa7f) -# define SFMT_MASK3 ((uint32_t)0xffddfbfb) -# define SFMT_PARITY0 ((uint32_t)0xf8000001) -# define SFMT_PARITY1 ((uint32_t)0x89e80709) -# define SFMT_PARITY2 ((uint32_t)0x3bd2b64b) -# define SFMT_PARITY3 ((uint32_t)0x0c64b1e4) - -# else -# error "Unsupported SFMT exponent" -# endif - - /* Some useful derived quantities */ - - SFMT_N = SFMT_E/128 + 1, /* Number of 128-bit vectors in state */ - SFMT_NM = SFMT_N-SFMT_M, - SFMT_L2A = 8*SFMT_L2, - SFMT_R2A = 8*SFMT_R2, - SFMT_L2B = 32-SFMT_L2A, - SFMT_R2B = 32-SFMT_R2A, - - SFMT_NC = SFMT_N*sizeof(sfmt_128_t), - SFMT_NH = SFMT_NC/sizeof(unsigned short), - SFMT_NI = SFMT_NC/sizeof(unsigned int), - SFMT_NL = SFMT_NC/sizeof(unsigned long), - SFMT_N8 = SFMT_NC/sizeof(uint8_t), - SFMT_N16 = SFMT_NC/sizeof(uint16_t), - SFMT_N32 = SFMT_NC/sizeof(uint32_t), - SFMT_N64 = SFMT_NC/sizeof(uint64_t) +enum sfmt_parameters +{ + +/* Parameter sets are defined by: + + SFMT_M = ... < floor(SFMT_E/128) ..., + SFMT_L1 = ... < 32 ..., + SFMT_L2 = ... < 4 ..., + SFMT_R1 = ... < 32 ..., + SFMT_R2 = ... < 3 ... + # define SFMT_MASK0 ((uint32_t)...) + # define SFMT_MASK1 ((uint32_t)...) + # define SFMT_MASK2 ((uint32_t)...) + # define SFMT_MASK3 ((uint32_t)...) + # define SFMT_PARITY0 ((uint32_t)...) + # define SFMT_PARITY1 ((uint32_t)...) + # define SFMT_PARITY2 ((uint32_t)...) + # define SFMT_PARITY3 ((uint32_t)...) 
+ + The masks and parities technically can't be part of the enum + because enums hate potentially large unsigned quantities */ + +#if SFMT_E == 607 /* Verified */ + + SFMT_M = 2, + SFMT_L1 = 15, + SFMT_L2 = 3, + SFMT_R1 = 13, + SFMT_R2 = 3, +#define SFMT_MASK0 ( (uint32_t)0xfdff37ff ) +#define SFMT_MASK1 ( (uint32_t)0xef7f3f7d ) +#define SFMT_MASK2 ( (uint32_t)0xff777b7d ) +#define SFMT_MASK3 ( (uint32_t)0x7ff7fb2f ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY3 ( (uint32_t)0x5986f054 ) + +#elif SFMT_E == 1279 /* Verified */ + + SFMT_M = 7, + SFMT_L1 = 14, + SFMT_L2 = 3, + SFMT_R1 = 5, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0xf7fefffd ) +#define SFMT_MASK1 ( (uint32_t)0x7fefcfff ) +#define SFMT_MASK2 ( (uint32_t)0xaff3ef3f ) +#define SFMT_MASK3 ( (uint32_t)0xb5ffff7f ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY3 ( (uint32_t)0x20000000 ) + +#elif SFMT_E == 2281 /* Verified */ + + SFMT_M = 12, + SFMT_L1 = 19, + SFMT_L2 = 1, + SFMT_R1 = 5, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0xbff7ffbf ) +#define SFMT_MASK1 ( (uint32_t)0xfdfffffe ) +#define SFMT_MASK2 ( (uint32_t)0xf7ffef7f ) +#define SFMT_MASK3 ( (uint32_t)0xf2f7cbbf ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY3 ( (uint32_t)0x41dfa600 ) + +#elif SFMT_E == 4253 /* Verified */ + + SFMT_M = 17, + SFMT_L1 = 20, + SFMT_L2 = 1, + SFMT_R1 = 7, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0x9f7bffff ) +#define SFMT_MASK1 ( (uint32_t)0x9fffff5f ) +#define SFMT_MASK2 ( (uint32_t)0x3efffffb ) +#define SFMT_MASK3 ( (uint32_t)0xfffff7bb ) +#define SFMT_PARITY0 ( (uint32_t)0xa8000001 ) +#define SFMT_PARITY1 ( (uint32_t)0xaf5390a3 ) +#define SFMT_PARITY2 ( 
(uint32_t)0xb740b3f8 ) +#define SFMT_PARITY3 ( (uint32_t)0x6c11486d ) + +#elif SFMT_E == 11213 /* Verified */ + + SFMT_M = 68, + SFMT_L1 = 14, + SFMT_L2 = 3, + SFMT_R1 = 7, + SFMT_R2 = 3, +#define SFMT_MASK0 ( (uint32_t)0xeffff7fb ) +#define SFMT_MASK1 ( (uint32_t)0xffffffef ) +#define SFMT_MASK2 ( (uint32_t)0xdfdfbfff ) +#define SFMT_MASK3 ( (uint32_t)0x7fffdbfd ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0xe8148000 ) +#define SFMT_PARITY3 ( (uint32_t)0xd0c7afa3 ) + +#elif SFMT_E == 19937 /* Verified */ + + SFMT_M = 122, + SFMT_L1 = 18, + SFMT_L2 = 1, + SFMT_R1 = 11, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0xdfffffef ) +#define SFMT_MASK1 ( (uint32_t)0xddfecb7f ) +#define SFMT_MASK2 ( (uint32_t)0xbffaffff ) +#define SFMT_MASK3 ( (uint32_t)0xbffffff6 ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY3 ( (uint32_t)0x13c9e684 ) + +#elif SFMT_E == 44497 /* Verified */ + + SFMT_M = 330, + SFMT_L1 = 5, + SFMT_L2 = 3, + SFMT_R1 = 9, + SFMT_R2 = 3, +#define SFMT_MASK0 ( (uint32_t)0xeffffffb ) +#define SFMT_MASK1 ( (uint32_t)0xdfbebfff ) +#define SFMT_MASK2 ( (uint32_t)0xbfbf7bef ) +#define SFMT_MASK3 ( (uint32_t)0x9ffd7bff ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0xa3ac4000 ) +#define SFMT_PARITY3 ( (uint32_t)0xecc1327a ) + +/* Note: SFMT_E==86243 not supported because SFMT_L2 is too large + for this implementation */ + +#elif SFMT_E == 132049 /* Verified */ + + SFMT_M = 110, + SFMT_L1 = 19, + SFMT_L2 = 1, + SFMT_R1 = 21, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0xffffbb5f ) +#define SFMT_MASK1 ( (uint32_t)0xfb6ebf95 ) +#define SFMT_MASK2 ( (uint32_t)0xfffefffa ) +#define SFMT_MASK3 ( (uint32_t)0xcff77fff ) +#define SFMT_PARITY0 ( (uint32_t)0x00000001 ) +#define SFMT_PARITY1 ( 
(uint32_t)0x00000000 ) +#define SFMT_PARITY2 ( (uint32_t)0xcb520000 ) +#define SFMT_PARITY3 ( (uint32_t)0xc7e91c7d ) + +#elif SFMT_E == 216091 /* Verified */ + + SFMT_M = 627, + SFMT_L1 = 11, + SFMT_L2 = 3, + SFMT_R1 = 10, + SFMT_R2 = 1, +#define SFMT_MASK0 ( (uint32_t)0xbff7bff7 ) +#define SFMT_MASK1 ( (uint32_t)0xbfffffff ) +#define SFMT_MASK2 ( (uint32_t)0xbffffa7f ) +#define SFMT_MASK3 ( (uint32_t)0xffddfbfb ) +#define SFMT_PARITY0 ( (uint32_t)0xf8000001 ) +#define SFMT_PARITY1 ( (uint32_t)0x89e80709 ) +#define SFMT_PARITY2 ( (uint32_t)0x3bd2b64b ) +#define SFMT_PARITY3 ( (uint32_t)0x0c64b1e4 ) + +#else +#error "Unsupported SFMT exponent" +#endif + + /* Some useful derived quantities */ + + SFMT_N = SFMT_E / 128 + 1, /* Number of 128-bit vectors in state */ + SFMT_NM = SFMT_N - SFMT_M, + SFMT_L2A = 8 * SFMT_L2, + SFMT_R2A = 8 * SFMT_R2, + SFMT_L2B = 32 - SFMT_L2A, + SFMT_R2B = 32 - SFMT_R2A, + + SFMT_NC = SFMT_N * sizeof( sfmt_128_t ), + SFMT_NH = SFMT_NC / sizeof( unsigned short ), + SFMT_NI = SFMT_NC / sizeof( unsigned int ), + SFMT_NL = SFMT_NC / sizeof( unsigned long ), + SFMT_N8 = SFMT_NC / sizeof( uint8_t ), + SFMT_N16 = SFMT_NC / sizeof( uint16_t ), + SFMT_N32 = SFMT_NC / sizeof( uint32_t ), + SFMT_N64 = SFMT_NC / sizeof( uint64_t ) }; -struct rng { - union { - sfmt_128_t sfmt[ SFMT_N ]; /* Actual randgen state */ - /* Other ways to index into the state */ - unsigned char uc[ SFMT_NC ]; - unsigned short uh[ SFMT_NH ]; - unsigned int ui[ SFMT_NI ]; - unsigned long ul[ SFMT_NL ]; - uint8_t u8[ SFMT_N8 ]; - uint16_t u16[ SFMT_N16 ]; - uint32_t u32[ SFMT_N32 ]; - uint64_t u64[ SFMT_N64 ]; - } state; - uint32_t n; /* Next unextracted byte */ - uint32_t pad[3]; /* 16-byte align */ +struct rng +{ + union { + sfmt_128_t sfmt[SFMT_N]; /* Actual randgen state */ + /* Other ways to index into the state */ + unsigned char uc[SFMT_NC]; + unsigned short uh[SFMT_NH]; + unsigned int ui[SFMT_NI]; + unsigned long ul[SFMT_NL]; + uint8_t u8[SFMT_N8]; + uint16_t 
u16[SFMT_N16]; + uint32_t u32[SFMT_N32]; + uint64_t u64[SFMT_N64]; + } state; + uint32_t n; /* Next unextracted byte */ + uint32_t pad[3]; /* 16-byte align */ }; -#if defined(__SSE2__) +#if defined( __SSE2__ ) -# define DECL_SFMT \ - __m128i a_u, mask = _mm_setr_epi32( SFMT_MASK0, SFMT_MASK1, \ - SFMT_MASK2, SFMT_MASK3 ) +#define DECL_SFMT \ + __m128i a_u, mask = _mm_setr_epi32( SFMT_MASK0, SFMT_MASK1, SFMT_MASK2, \ + SFMT_MASK3 ) -# define SFMT( a, b, c, d ) \ - a_u = a.u; \ - a.u = _mm_xor_si128( a_u, _mm_xor_si128( \ - _mm_xor_si128( _mm_slli_si128( a_u, SFMT_L2 ), \ - _mm_and_si128( _mm_srli_epi32( b.u, SFMT_R1 ), mask ) ), \ - _mm_xor_si128( _mm_srli_si128( c.u, SFMT_R2 ), \ - _mm_slli_epi32( d.u, SFMT_L1 ) ) ) ) +#define SFMT( a, b, c, d ) \ + a_u = a.u; \ + a.u = _mm_xor_si128( \ + a_u, _mm_xor_si128( \ + _mm_xor_si128( \ + _mm_slli_si128( a_u, SFMT_L2 ), \ + _mm_and_si128( _mm_srli_epi32( b.u, SFMT_R1 ), mask ) ), \ + _mm_xor_si128( _mm_srli_si128( c.u, SFMT_R2 ), \ + _mm_slli_epi32( d.u, SFMT_L1 ) ) ) ) #else -# define DECL_SFMT \ - uint32_t x0, x1, x2, x3, y0, y1, y2, y3 - -# define SFMT( a, b, c, d ) \ - x0 = ( a.u0 << SFMT_L2A ); \ - x1 = ( a.u1 << SFMT_L2A ) | ( a.u0 >> SFMT_L2B ); \ - x2 = ( a.u2 << SFMT_L2A ) | ( a.u1 >> SFMT_L2B ); \ - x3 = ( a.u3 << SFMT_L2A ) | ( a.u2 >> SFMT_L2B ); \ - y0 = ( c.u0 >> SFMT_R2A ) | ( c.u1 << SFMT_R2B ); \ - y1 = ( c.u1 >> SFMT_R2A ) | ( c.u2 << SFMT_R2B ); \ - y2 = ( c.u2 >> SFMT_R2A ) | ( c.u3 << SFMT_R2B ); \ - y3 = ( c.u3 >> SFMT_R2A ); \ - a.u0 ^= (x0 ^ ((b.u0>>SFMT_R1)&SFMT_MASK0)) ^ (y0 ^ (d.u0<>SFMT_R1)&SFMT_MASK1)) ^ (y1 ^ (d.u1<>SFMT_R1)&SFMT_MASK2)) ^ (y2 ^ (d.u2<>SFMT_R1)&SFMT_MASK3)) ^ (y3 ^ (d.u3<> SFMT_L2B ); \ + x2 = ( a.u2 << SFMT_L2A ) | ( a.u1 >> SFMT_L2B ); \ + x3 = ( a.u3 << SFMT_L2A ) | ( a.u2 >> SFMT_L2B ); \ + y0 = ( c.u0 >> SFMT_R2A ) | ( c.u1 << SFMT_R2B ); \ + y1 = ( c.u1 >> SFMT_R2A ) | ( c.u2 << SFMT_R2B ); \ + y2 = ( c.u2 >> SFMT_R2A ) | ( c.u3 << SFMT_R2B ); \ + y3 = ( c.u3 >> SFMT_R2A 
); \ + a.u0 ^= ( x0 ^ ( ( b.u0 >> SFMT_R1 ) & SFMT_MASK0 ) ) ^ \ + ( y0 ^ ( d.u0 << SFMT_L1 ) ); \ + a.u1 ^= ( x1 ^ ( ( b.u1 >> SFMT_R1 ) & SFMT_MASK1 ) ) ^ \ + ( y1 ^ ( d.u1 << SFMT_L1 ) ); \ + a.u2 ^= ( x2 ^ ( ( b.u2 >> SFMT_R1 ) & SFMT_MASK2 ) ) ^ \ + ( y2 ^ ( d.u2 << SFMT_L1 ) ); \ + a.u3 ^= ( x3 ^ ( ( b.u3 >> SFMT_R1 ) & SFMT_MASK3 ) ) ^ \ + ( y3 ^ ( d.u3 << SFMT_L1 ) ) #endif -STATIC_INLINE void -sfmt_next( sfmt_128_t * RESTRICT sfmt ) { - DECL_SFMT; - int n; - - SFMT( sfmt[0], sfmt[ SFMT_M ], sfmt[ SFMT_N-2 ], sfmt[ SFMT_N-1 ] ); - SFMT( sfmt[1], sfmt[ SFMT_M+1 ], sfmt[ SFMT_N-1 ], sfmt[ 0 ] ); - for( n=2; nstate.p[0]) aligned */ -#define RNG_NEXT( a, t, r, p, rs ) do { \ - uint32_t _n = ((r)->n + ((uint32_t)sizeof((r)->state.p[0]))-1 ) & \ - /**/ (~(((uint32_t)sizeof((r)->state.p[0]))-1)); \ - if( _n >= SFMT_NC ) sfmt_next( (r)->state.sfmt ), _n = 0; \ - (a) = ((r)->state.p[ _n/(uint32_t)sizeof((r)->state.p[0]) ] >> (rs)); \ - (r)->n = _n+(uint32_t)sizeof((r)->state.p[0]); \ - } while(0) +#define RNG_NEXT( a, t, r, p, rs ) \ + do \ + { \ + uint32_t _n = \ + ( ( r )->n + ( (uint32_t)sizeof( ( r )->state.p[0] ) ) - 1 ) & \ + /**/ ( ~( ( (uint32_t)sizeof( ( r )->state.p[0] ) ) - 1 ) ); \ + if ( _n >= SFMT_NC ) \ + sfmt_next( ( r )->state.sfmt ), _n = 0; \ + ( a ) = \ + ( ( r )->state.p[_n / (uint32_t)sizeof( ( r )->state.p[0] )] >> \ + ( rs ) ); \ + ( r )->n = _n + (uint32_t)sizeof( ( r )->state.p[0] ); \ + } while ( 0 ) /* Integer to random floating point conversions @@ -284,17 +337,21 @@ sfmt_next( sfmt_128_t * RESTRICT sfmt ) { representable number less than 1 is 1-eps/2 and for this midpoint of the lattice, the lattice spacing over the whole interval must be 2*(eps/2), rather than eps/2. - - Similar issues apply for the frand. 
*/ -#define conv_frand(u32) ((((u32)>>9 )+0.5f )*(2.f/16777216.f )) -#define conv_frand_c0(u32) (( (u32)>>8 )*(1.f/16777216.f )) -#define conv_frand_c1(u32) ((((u32)>>8 )+1 )*(1.f/16777216.f )) -#define conv_frand_c(u32) ((((u32)>>8 )+((u32)&1))*(1.f/16777216.f )) + Similar issues apply for the frand. */ -#define conv_drand(u64) ((((u64)>>12)+0.5 )*(2. /9007199254740992.)) -#define conv_drand_c0(u64) (( (u64)>>11 )*(1. /9007199254740992.)) -#define conv_drand_c1(u64) ((((u64)>>11)+1 )*(1. /9007199254740992.)) -#define conv_drand_c(u64) ((((u64)>>11)+((u64)&1))*(1. /9007199254740992.)) +#define conv_frand( u32 ) ( ( ( ( u32 ) >> 9 ) + 0.5f ) * ( 2.f / 16777216.f ) ) +#define conv_frand_c0( u32 ) ( ( ( u32 ) >> 8 ) * ( 1.f / 16777216.f ) ) +#define conv_frand_c1( u32 ) ( ( ( ( u32 ) >> 8 ) + 1 ) * ( 1.f / 16777216.f ) ) +#define conv_frand_c( u32 ) \ + ( ( ( ( u32 ) >> 8 ) + ( (u32)&1 ) ) * ( 1.f / 16777216.f ) ) + +#define conv_drand( u64 ) \ + ( ( ( ( u64 ) >> 12 ) + 0.5 ) * ( 2. / 9007199254740992. ) ) +#define conv_drand_c0( u64 ) ( ( ( u64 ) >> 11 ) * ( 1. / 9007199254740992. ) ) +#define conv_drand_c1( u64 ) \ + ( ( ( ( u64 ) >> 11 ) + 1 ) * ( 1. / 9007199254740992. ) ) +#define conv_drand_c( u64 ) \ + ( ( ( ( u64 ) >> 11 ) + ( (u64)&1 ) ) * ( 1. / 9007199254740992. 
) ) #endif /* _rng_private_h_ */ diff --git a/src/util/swap.h b/src/util/swap.h index 97b0c6ac..3bb47f6e 100644 --- a/src/util/swap.h +++ b/src/util/swap.h @@ -1,12 +1,12 @@ /* - Byte-swapping utilities + Byte-swapping utilities - Author: Benjamin Karl Bergen + Author: Benjamin Karl Bergen - $Revision$ - $LastChangedBy$ - $LastChangedDate$ - vim: set ts=3 : + $Revision$ + $LastChangedBy$ + $LastChangedDate$ + vim: set ts=3 : */ #ifndef swap_h @@ -16,184 +16,197 @@ #include "sf_interface.h" #include "species_advance.h" -#if defined(__GNUC__) - #include +#if defined( __GNUC__ ) +#include #else - #define bswap_16(x) ((((uint16_t)(x) & 0xff00u) >> 8) | \ - (((uint16_t)(x) & 0x00ffu) << 8)) - #define bswap_32(x) ((((uint32_t)(x) & 0xff000000u) >> 24) | \ - (((uint32_t)(x) & 0x00ff0000u) >> 8) | \ - (((uint32_t)(x) & 0x0000ff00u) << 8) | \ - (((uint32_t)(x) & 0x000000ffu) << 24)) - #define bswap_64(x) ((((uint64_t)(x) & 0xff00000000000000ull) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ull) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ull) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ull) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ull) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ull) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ull) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffull) << 56)) +#define bswap_16( x ) \ + ( ( ( ( uint16_t )(x)&0xff00u ) >> 8 ) | \ + ( ( ( uint16_t )(x)&0x00ffu ) << 8 ) ) +#define bswap_32( x ) \ + ( ( ( ( uint32_t )(x)&0xff000000u ) >> 24 ) | \ + ( ( ( uint32_t )(x)&0x00ff0000u ) >> 8 ) | \ + ( ( ( uint32_t )(x)&0x0000ff00u ) << 8 ) | \ + ( ( ( uint32_t )(x)&0x000000ffu ) << 24 ) ) +#define bswap_64( x ) \ + ( ( ( ( uint64_t )(x)&0xff00000000000000ull ) >> 56 ) | \ + ( ( ( uint64_t )(x)&0x00ff000000000000ull ) >> 40 ) | \ + ( ( ( uint64_t )(x)&0x0000ff0000000000ull ) >> 24 ) | \ + ( ( ( uint64_t )(x)&0x000000ff00000000ull ) >> 8 ) | \ + ( ( ( uint64_t )(x)&0x00000000ff000000ull ) << 8 ) | \ + ( ( ( uint64_t 
)(x)&0x0000000000ff0000ull ) << 24 ) | \ + ( ( ( uint64_t )(x)&0x000000000000ff00ull ) << 40 ) | \ + ( ( ( uint64_t )(x)&0x00000000000000ffull ) << 56 ) ) #endif // __GNUC__ #if defined __cplusplus -namespace utils { +namespace utils +{ -//template void inline swap(T & element); -void inline swap(char & element) {} +// template void inline swap(T & element); +void inline swap( char& element ) {} -void inline swap(short int & element) { - element = bswap_16(element); +void inline swap( short int& element ) +{ + element = bswap_16( element ); } // swap -//template<> void inline swap(double & element) { -void inline swap(double & element) { +// template<> void inline swap(double & element) { +void inline swap( double& element ) +{ - union type64 { - type64(double d_) : d(d_) {} + union type64 { + type64( double d_ ) + : d( d_ ) + { + } - double d; - uint64_t ui; - } t64(element); + double d; + uint64_t ui; + } t64( element ); - t64.ui = bswap_64(t64.ui); - element = t64.d; + t64.ui = bswap_64( t64.ui ); + element = t64.d; } // swap union type32 { - type32(float f_) : f(f_) {} + type32( float f_ ) + : f( f_ ) + { + } - float f; - uint32_t ui; + float f; + uint32_t ui; }; -float inline swap_float(float f) { - union type32 { - type32(float f_) : f(f_) {} +float inline swap_float( float f ) +{ + union type32 { + type32( float f_ ) + : f( f_ ) + { + } - float f; - uint32_t ui; - } t32(f); + float f; + uint32_t ui; + } t32( f ); - t32.ui = bswap_32(t32.ui); - return t32.f; + t32.ui = bswap_32( t32.ui ); + return t32.f; } // swap_float -//template<> void inline swap(uint64_t & element) { -void inline swap(uint64_t & element) { - element = bswap_64(element); -} // swap - -//template<> void inline swap(int64_t & element) { -void inline swap(int64_t & element) { - element = bswap_64(element); -} // swap - -//template<> void inline swap(float & element) { -void inline swap(float & element) { +// template<> void inline swap(uint64_t & element) { +void inline swap( uint64_t& 
element ) { element = bswap_64( element ); } // swap - union type32 { - type32(float f_) : f(f_) {} +// template<> void inline swap(int64_t & element) { +void inline swap( int64_t& element ) { element = bswap_64( element ); } // swap - float f; - uint32_t ui; - } t32(element); +// template<> void inline swap(float & element) { +void inline swap( float& element ) +{ - t32.ui = bswap_32(t32.ui); - element = t32.f; -} // swap - -void inline swap(uint16_t & element) { - element = bswap_16(element); -} // swap + union type32 { + type32( float f_ ) + : f( f_ ) + { + } -//template<> void inline swap(uint32_t & element) { -void inline swap(uint32_t & element) { - element = bswap_32(element); -} // swap + float f; + uint32_t ui; + } t32( element ); -//template<> void inline swap(int32_t & element) { -void inline swap(int32_t & element) { - element = bswap_32(element); + t32.ui = bswap_32( t32.ui ); + element = t32.f; } // swap -void inline swap(field_t & element) { - // electric field - utils::swap(element.ex); - utils::swap(element.ey); - utils::swap(element.ez); - utils::swap(element.div_e_err); - - // magnetic field - utils::swap(element.cbx); - utils::swap(element.cby); - utils::swap(element.cbz); - utils::swap(element.div_b_err); - - // tca field - utils::swap(element.tcax); - utils::swap(element.tcay); - utils::swap(element.tcaz); - utils::swap(element.rhob); - - // tca field - utils::swap(element.jfx); - utils::swap(element.jfy); - utils::swap(element.jfz); - utils::swap(element.rhof); - - // material - utils::swap(element.ematx); - utils::swap(element.ematy); - utils::swap(element.ematz); - utils::swap(element.nmat); - - // material - utils::swap(element.fmatx); - utils::swap(element.fmaty); - utils::swap(element.fmatz); - utils::swap(element.cmat); +void inline swap( uint16_t& element ) { element = bswap_16( element ); } // swap + +// template<> void inline swap(uint32_t & element) { +void inline swap( uint32_t& element ) { element = bswap_32( element ); } // swap 
+ +// template<> void inline swap(int32_t & element) { +void inline swap( int32_t& element ) { element = bswap_32( element ); } // swap + +void inline swap( field_t& element ) +{ + // electric field + utils::swap( element.ex ); + utils::swap( element.ey ); + utils::swap( element.ez ); + utils::swap( element.div_e_err ); + + // magnetic field + utils::swap( element.cbx ); + utils::swap( element.cby ); + utils::swap( element.cbz ); + utils::swap( element.div_b_err ); + + // tca field + utils::swap( element.tcax ); + utils::swap( element.tcay ); + utils::swap( element.tcaz ); + utils::swap( element.rhob ); + + // tca field + utils::swap( element.jfx ); + utils::swap( element.jfy ); + utils::swap( element.jfz ); + utils::swap( element.rhof ); + + // material + utils::swap( element.ematx ); + utils::swap( element.ematy ); + utils::swap( element.ematz ); + utils::swap( element.nmat ); + + // material + utils::swap( element.fmatx ); + utils::swap( element.fmaty ); + utils::swap( element.fmatz ); + utils::swap( element.cmat ); } // swap -void inline swap(hydro_t & element) { - // current and charge - utils::swap(element.jx); - utils::swap(element.jy); - utils::swap(element.jz); - utils::swap(element.rho); - - // current and charge - utils::swap(element.px); - utils::swap(element.py); - utils::swap(element.pz); - utils::swap(element.ke); - - // stress diag - utils::swap(element.txx); - utils::swap(element.tyy); - utils::swap(element.tzz); - - // stress off-diag - utils::swap(element.tyz); - utils::swap(element.tzx); - utils::swap(element.txy); +void inline swap( hydro_t& element ) +{ + // current and charge + utils::swap( element.jx ); + utils::swap( element.jy ); + utils::swap( element.jz ); + utils::swap( element.rho ); + + // current and charge + utils::swap( element.px ); + utils::swap( element.py ); + utils::swap( element.pz ); + utils::swap( element.ke ); + + // stress diag + utils::swap( element.txx ); + utils::swap( element.tyy ); + utils::swap( element.tzz ); + + 
// stress off-diag + utils::swap( element.tyz ); + utils::swap( element.tzx ); + utils::swap( element.txy ); } // swap -void inline swap(particle_t & element) { - // position - utils::swap(element.dx); - utils::swap(element.dy); - utils::swap(element.dz); +void inline swap( particle_t& element ) +{ + // position + utils::swap( element.dx ); + utils::swap( element.dy ); + utils::swap( element.dz ); - // id - utils::swap(element.i); + // id + utils::swap( element.i ); - // momentum - utils::swap(element.ux); - utils::swap(element.uy); - utils::swap(element.uz); + // momentum + utils::swap( element.ux ); + utils::swap( element.uy ); + utils::swap( element.uz ); - // charge - utils::swap(element.w); + // charge + utils::swap( element.w ); } // swap } // namespace utils diff --git a/src/util/system.h b/src/util/system.h index 85edb93a..9055d122 100644 --- a/src/util/system.h +++ b/src/util/system.h @@ -13,69 +13,72 @@ #ifndef SystemRAM_h #define SystemRAM_h -#include +#include #include +#include #include #include -#include #include "util_base.h" // String to type conversion template -bool from_string(T & t, const std::string & s, - std::ios_base & (*f)(std::ios_base&)) { - std::istringstream iss(s); - return !(iss >> f >> t).fail(); +bool from_string( T& t, const std::string& s, + std::ios_base& ( *f )(std::ios_base&)) +{ + std::istringstream iss( s ); + return !( iss >> f >> t ).fail(); } // from_string - /*! - \struct SystemRAM SystemRAM.h - \brief SystemRAM provides... + \struct SystemRAM SystemRAM.h + \brief SystemRAM provides... */ struct SystemRAM - { - static inline void print_available() { - MESSAGE(("Available RAM (kilobytes): %ld", available())); - } // print_available +{ + static inline void print_available() + { + MESSAGE( ( "Available RAM (kilobytes): %ld", available() ) ); + } // print_available + + //! Report the available RAM on the system in kilobytes. 
+ static inline uint64_t available() + { + +#if !__linux__ + ERROR( ( "SystemRAM: Unsupported Operating System!!!" ) ); +#endif - //! Report the available RAM on the system in kilobytes. - static inline uint64_t available() { - - #if !__linux__ - ERROR(("SystemRAM: Unsupported Operating System!!!")); - #endif + char buffer[81]; + std::ifstream meminfo( "/proc/meminfo", std::ifstream::in ); - char buffer[81]; - std::ifstream meminfo("/proc/meminfo", std::ifstream::in); - - // Make sure that we were able to open the file - if(meminfo.fail()) { - ERROR(("Failed opening /proc/meminfo file!!!")); - } // if + // Make sure that we were able to open the file + if ( meminfo.fail() ) + { + ERROR( ( "Failed opening /proc/meminfo file!!!" ) ); + } // if - // Get the MemFree line - meminfo.getline(buffer, 81); - meminfo.getline(buffer, 81); + // Get the MemFree line + meminfo.getline( buffer, 81 ); + meminfo.getline( buffer, 81 ); - meminfo.close(); + meminfo.close(); - // Parse out the free mem in kilobytes - std::string memfree = buffer; - size_t begin = memfree.find_first_not_of("MemFr: "); - size_t end = memfree.find(" ", begin); + // Parse out the free mem in kilobytes + std::string memfree = buffer; + size_t begin = memfree.find_first_not_of( "MemFr: " ); + size_t end = memfree.find( " ", begin ); - // Convert to size_t - uint64_t kilobytes; - if(!from_string(kilobytes, - memfree.substr(begin, end-begin), - std::dec)) { - ERROR(("String conversion to size_t failed!!!")); - } // if + // Convert to size_t + uint64_t kilobytes; + if ( !from_string( + kilobytes, memfree.substr( begin, end - begin ), std::dec ) ) + { + ERROR( ( "String conversion to size_t failed!!!" 
) ); + } // if - return kilobytes; - } // available - }; // class SystemRAM + return kilobytes; + } // available +}; // class SystemRAM #endif // SystemRAM_h diff --git a/src/util/util.h b/src/util/util.h index bfd97fe3..2336d986 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -4,27 +4,24 @@ // Expose all public functionality in util. The below includes bring // in util_base.h and other low level includes automatically. -#include "v4/v4.h" -#include "v8/v8.h" -#include "v16/v16.h" #include "checkpt/checkpt.h" #include "mp/mp.h" -#include "rng/rng.h" #include "pipelines/pipelines.h" #include "profile/profile.h" +#include "rng/rng.h" +#include "v16/v16.h" +#include "v4/v4.h" +#include "v8/v8.h" BEGIN_C_DECLS // Boot all util functionality (should be the first thing in the program) -void -boot_services( int * pargc, - char *** pargv ); +void boot_services( int* pargc, char*** pargv ); // Halt all util functionality (should be the last thing in the program) -void -halt_services( void ); +void halt_services( void ); // Give an estimate of the time when boot_timestamp was called // (in seconds since the epoch). All processes agree on this. @@ -36,8 +33,7 @@ extern double _boot_timestamp; // called (in seconds). This call must be loosly synchronous over // all processes; all processes agree on the result. -double -uptime( void ); +double uptime( void ); END_C_DECLS diff --git a/src/util/util_base.h b/src/util/util_base.h index bc9db329..b71fe043 100644 --- a/src/util/util_base.h +++ b/src/util/util_base.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -12,7 +12,9 @@ #define _util_base_h_ #ifdef __cplusplus -#define BEGIN_C_DECLS extern "C" { +#define BEGIN_C_DECLS \ + extern "C" \ + { #define END_C_DECLS } #else @@ -21,7 +23,7 @@ #endif // C99 does requires some key macros of stdint to only be defined in -// C++ implementations if explicitly requested. +// C++ implementations if explicitly requested. 
#define __STDC_LIMIT_MACROS @@ -33,12 +35,12 @@ #define __STDC_CONSTANT_MACROS +#include // For floating point limits +#include // For integer limits +#include // For math prototypes +#include // For fixed width integer types #include // For exit, size_t, NULL #include // For string and memory manipulation -#include // For fixed width integer types -#include // For math prototypes -#include // For integer limits -#include // For floating point limits // Opaque handle of a set of communicating processes @@ -48,10 +50,10 @@ typedef struct collective collective_t; // These macros facilitate doing evil tricks #define BEGIN_PRIMITIVE do -#define END_PRIMITIVE while(0) +#define END_PRIMITIVE while ( 0 ) -#define _UTIL_STRINGIFY(s)#s -#define EXPAND_AND_STRINGIFY(s)_UTIL_STRINGIFY(s) +#define _UTIL_STRINGIFY( s ) #s +#define EXPAND_AND_STRINGIFY( s ) _UTIL_STRINGIFY( s ) // Function inlining @@ -70,14 +72,14 @@ typedef struct collective collective_t; // at compile time, this optimization will be disabled. #ifdef NO_BRANCH_HINTS -#define LIKLEY(_c) _c -#define UNLIKLEY(_c) _c +#define LIKLEY( _c ) _c +#define UNLIKLEY( _c ) _c #else #ifndef LIKELY -#define LIKELY(_c) __builtin_expect((_c),1) +#define LIKELY( _c ) __builtin_expect( ( _c ), 1 ) #endif #ifndef UNLIKELY -#define UNLIKELY(_c) __builtin_expect((_c),0) +#define UNLIKELY( _c ) __builtin_expect( ( _c ), 0 ) #endif #endif @@ -91,7 +93,7 @@ typedef struct collective collective_t; // per-platform basis #ifndef ALIGNED -#define ALIGNED(a) +#define ALIGNED( a ) #endif // This pointer modifier indicates that a pointer is restricted in @@ -102,7 +104,7 @@ typedef struct collective collective_t; #ifndef RESTRICT #define RESTRICT __restrict -#endif +#endif // Normal pointers (e.g. a *) are in whatever address space the given // compile unit uses. 
However, sometimes it is necessary to declare @@ -112,11 +114,11 @@ typedef struct collective collective_t; // both the SPU and PPU with appropriate annotations to necessary // write the appropriate DMA transfers. -# define MEM_PTR(type,align) type * ALIGNED(align) +#define MEM_PTR( type, align ) type* ALIGNED( align ) // The SIZEOF_MEM_PTR macro gives the number of bytes taken by a MEM_PTR. -#define SIZEOF_MEM_PTR sizeof(MEM_PTR(void,1)) +#define SIZEOF_MEM_PTR sizeof( MEM_PTR( void, 1 ) ) // DECLARE_ALIGNED_ARRAY declares an array containing count elements // with the given alignment in memory. The scope of the array is the @@ -135,13 +137,13 @@ typedef struct collective collective_t; // align should be a power of two. #if 0 // C99 has (dubious) issues with this -#define DECLARE_ALIGNED_ARRAY(type,align,name,count) \ - char _aa_##name[(count)*sizeof(type)+(align)]; \ - type * ALIGNED(align) const name = (type * ALIGNED(align)) \ - ( ( (size_t)_aa_##name + (align) - 1 ) & (~((align)-1)) ) +#define DECLARE_ALIGNED_ARRAY( type, align, name, count ) \ + char _aa_##name[( count ) * sizeof( type ) + ( align )]; \ + type* ALIGNED( align ) const name = ( type * ALIGNED( align ) )( \ + ( (size_t)_aa_##name + (align)-1 ) & ( ~( (align)-1 ) ) ) #else // Sigh ... this is technically not portable -#define DECLARE_ALIGNED_ARRAY(type,align,name,count) \ - type name[(count)] __attribute__ ((aligned (align))) +#define DECLARE_ALIGNED_ARRAY( type, align, name, count ) \ + type name[( count )] __attribute__( ( aligned( align ) ) ) #endif // PAD(s,a) computes the amount of bytes necessary to add to "s" bytes @@ -154,20 +156,21 @@ typedef struct collective collective_t; // allow correct autogeneration when no alignment necessary ... sigh // ... -#define PAD(s,a) ( (a) - ( (s) & ( (a)-1 ) ) ) +#define PAD( s, a ) ( ( a ) - ( ( s ) & ( (a)-1 ) ) ) // POW2_CEIL rounds "u" up to the nearest multiple of the power of two // "a". If u is a multiple of "a", its value is unchanged. 
"a" should // be safe against multiple dereferencing and the same type as "u". -#define POW2_CEIL(u,a) ( ((u)+(a)-1) & (~((a)-1)) ) +#define POW2_CEIL( u, a ) ( ( ( u ) + (a)-1 ) & ( ~( (a)-1 ) ) ) // ALIGN_PTR rounds "p" up to the nearest multiple of the power of two // "a". If p is a multiple of "a", its value is unchanged. "a" should // be safe against multiple dereferencing. The result is cast to a // pointer of type "t". -#define ALIGN_PTR(t,p,a) ((t *)POW2_CEIL( (size_t)(p), (size_t)(a) )) +#define ALIGN_PTR( t, p, a ) \ + ( (t*)POW2_CEIL( ( size_t )( p ), ( size_t )( a ) ) ) // Workload distribution macros @@ -188,50 +191,60 @@ typedef struct collective collective_t; // salt will replace the divison and modulo with bit shifts // and masks for power-of-two block sizes. -#define DISTRIBUTE( N, b, p, P, i, n ) BEGIN_PRIMITIVE { \ - int _N = (N), _b = (b), _p = (p), _P = (P); \ - double _t = (double)(_N/_b)/(double)_P; \ - int _i = _b*(int)(_t*(double) _p +0.5); \ - (n) = (_p==_P) ? (_N%_b) : (_b*(int)(_t*(double)(_p+1)+0.5)-_i); \ - (i) = _i; \ - } END_PRIMITIVE +#define DISTRIBUTE( N, b, p, P, i, n ) \ + BEGIN_PRIMITIVE \ + { \ + int _N = ( N ), _b = ( b ), _p = ( p ), _P = ( P ); \ + double _t = (double)( _N / _b ) / (double)_P; \ + int _i = _b * (int)( _t * (double)_p + 0.5 ); \ + ( n ) = ( _p == _P ) \ + ? ( _N % _b ) \ + : ( _b * (int)( _t * (double)( _p + 1 ) + 0.5 ) - _i ); \ + ( i ) = _i; \ + } \ + END_PRIMITIVE // INDEX_FORTRAN_x and INDEX_C_x give macros for accessing // multi-dimensional arrays with different conventions. 
To eliminate // potential side effects and maximize optimization possibilites, xl, // xh, yl, yh, zl, zh should be local constant ints -#define INDEX_FORTRAN_1(x,xl,xh) \ - ((x)-(xl)) -#define INDEX_FORTRAN_2(x,y,xl,xh,yl,yh) \ - ((x)-(xl) + ((xh)-(xl)+1)*((y)-(yl))) -#define INDEX_FORTRAN_3(x,y,z,xl,xh,yl,yh,zl,zh) \ - ((x)-(xl) + ((xh)-(xl)+1)*((y)-(yl) + ((yh)-(yl)+1)*((z)-(zl)))) - -#define INDEX_C_1(x,xl,xh) \ - ((x)-(xl)) -#define INDEX_C_2(x,y,xl,xh,yl,yh) \ - ((y)-(yl) + ((yh)-(yl)+1)*((x)-(xl))) -#define INDEX_C_3(x,y,z,xl,xh,yl,yh,zl,zh) \ - ((z)-(zl) + ((zh)-(zl)+1)*((y)-(yl) + ((yh)-(yl)+1)*((x)-(xl)))) +#define INDEX_FORTRAN_1( x, xl, xh ) ( ( x ) - ( xl ) ) +#define INDEX_FORTRAN_2( x, y, xl, xh, yl, yh ) \ + ( ( x ) - ( xl ) + ( ( xh ) - ( xl ) + 1 ) * ( ( y ) - ( yl ) ) ) +#define INDEX_FORTRAN_3( x, y, z, xl, xh, yl, yh, zl, zh ) \ + ( ( x ) - ( xl ) + \ + ( ( xh ) - ( xl ) + 1 ) * \ + ( ( y ) - ( yl ) + ( ( yh ) - ( yl ) + 1 ) * ( ( z ) - ( zl ) ) ) ) + +#define INDEX_C_1( x, xl, xh ) ( ( x ) - ( xl ) ) +#define INDEX_C_2( x, y, xl, xh, yl, yh ) \ + ( ( y ) - ( yl ) + ( ( yh ) - ( yl ) + 1 ) * ( ( x ) - ( xl ) ) ) +#define INDEX_C_3( x, y, z, xl, xh, yl, yh, zl, zh ) \ + ( ( z ) - ( zl ) + \ + ( ( zh ) - ( zl ) + 1 ) * \ + ( ( y ) - ( yl ) + ( ( yh ) - ( yl ) + 1 ) * ( ( x ) - ( xl ) ) ) ) // The following macros deal with linked lists -#define LIST_FOR_EACH(node,list) \ - for((node)=(list); (node); (node)=(node)->next) +#define LIST_FOR_EACH( node, list ) \ + for ( ( node ) = ( list ); ( node ); ( node ) = ( node )->next ) -#define LIST_FIND_FIRST(node,list,cond) do { \ - for((node)=(list); (node); (node)=(node)->next) \ - if(cond) break; \ - } while(0) +#define LIST_FIND_FIRST( node, list, cond ) \ + do \ + { \ + for ( ( node ) = ( list ); ( node ); ( node ) = ( node )->next ) \ + if ( cond ) \ + break; \ + } while ( 0 ) // Given an integer data type "type", MASK_BIT_RANGE returns a bit // field of that type for which bits [f,l] inclusive are 1 
and all // other bits are zero. Note that: 0<=f<=l0 ) memcpy( (d), (s), _sz ); } while(0) -#define MOVE( d, s, n ) do { size_t _sz = (n)*sizeof(*(d)); if( _sz>0 ) memmove( (d), (s), _sz ); } while(0) -#define CLEAR( d, n ) do { size_t _sz = (n)*sizeof(*(d)); if( _sz>0 ) memset( (d), 0, _sz ); } while(0) +#define COPY( d, s, n ) \ + do \ + { \ + size_t _sz = ( n ) * sizeof( *( d ) ); \ + if ( _sz > 0 ) \ + memcpy( ( d ), ( s ), _sz ); \ + } while ( 0 ) +#define MOVE( d, s, n ) \ + do \ + { \ + size_t _sz = ( n ) * sizeof( *( d ) ); \ + if ( _sz > 0 ) \ + memmove( ( d ), ( s ), _sz ); \ + } while ( 0 ) +#define CLEAR( d, n ) \ + do \ + { \ + size_t _sz = ( n ) * sizeof( *( d ) ); \ + if ( _sz > 0 ) \ + memset( ( d ), 0, _sz ); \ + } while ( 0 ) BEGIN_C_DECLS @@ -285,22 +322,19 @@ BEGIN_C_DECLS // (The macros turn these into rvals that can't be modified // by users accidentically). -#define world ((collective_t *)_world) -extern collective_t * _world; +#define world ( (collective_t*)_world ) +extern collective_t* _world; -#define world_size ((int)_world_size) +#define world_size ( (int)_world_size ) extern int _world_size; -#define world_rank ((int)_world_rank) +#define world_rank ( (int)_world_rank ) extern int _world_rank; // Strip all instances of key from the command line. Returns the // number of times key was found. -int -strip_cmdline( int * pargc, - char *** pargv, - const char * key ); +int strip_cmdline( int* pargc, char*** pargv, const char* key ); // Strip all instances of "key val" from the command line. Returns // val as an int of the last complete "key val" pair (if the last @@ -308,32 +342,23 @@ strip_cmdline( int * pargc, // otherwise ignored). If there are no instances of "key val" on // the command line, returns default_val. 
-int -strip_cmdline_int( int * pargc, - char *** pargv, - const char * key, - int default_val ); +int strip_cmdline_int( int* pargc, char*** pargv, const char* key, + int default_val ); // Same as strip_cmdline_int, but for doubles -double -strip_cmdline_double( int * pargc, - char *** pargv, - const char * key, - double default_val ); +double strip_cmdline_double( int* pargc, char*** pargv, const char* key, + double default_val ); // Same as strip_cmdline_int, but for strings. The lifetime of the // returned '\0'-terminated string is the shorter of the lifetime of // default_val or pargv. -const char * -strip_cmdline_string( int * pargc, - char *** pargv, - const char * key, - const char * default_val ); +const char* strip_cmdline_string( int* pargc, char*** pargv, const char* key, + const char* default_val ); // In util.c -void detect_old_style_arguments(int* pargc, char *** pargv); +void detect_old_style_arguments( int* pargc, char*** pargv ); // MALLOC is guaranteed to succeed from the caller's point of view // (thus, _no_ NULL checking the pointer is necessary). n is the @@ -341,61 +366,50 @@ void detect_old_style_arguments(int* pargc, char *** pargv); // of bytes to allocate). n==0 is a request for no elements and x is // set NULL as a result. -#define MALLOC(x,n) \ - util_malloc( "MALLOC( "#x", "#n" (%lu bytes) ) at " \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)) ) +#define MALLOC( x, n ) \ + util_malloc( "MALLOC( " #x ", " #n " (%lu bytes) ) at " __FILE__ \ + "(" EXPAND_AND_STRINGIFY( __LINE__ ) ") failed", \ + &( x ), ( n ) * sizeof( *( x ) ) ) -void -util_malloc( const char * err_fmt, // Has exactly one %lu in it - void * mem_ref, - size_t n ); +void util_malloc( const char* err_fmt, // Has exactly one %lu in it + void* mem_ref, size_t n ); // FREE frees memory allocated via MALLOC above. It is safe to pass // any value returned by MALLOC to FREE (_including_ a null pointer). 
// The pointer to the memory will be set to NULL to indicate that it // no longer points to anything. -#define FREE(x) util_free(&(x)) +#define FREE( x ) util_free( &( x ) ) -void -util_free( void * mem_ref ); +void util_free( void* mem_ref ); // MALLOC_ALIGNED behaves equivalently to MALLOC. The alignment must // be a power of two. Alignments smaller than 16 will be rounded up // to 16. -#define MALLOC_ALIGNED(x,n,a) \ - util_malloc_aligned( "MALLOC_ALIGNED( "#x", " \ - #n" (%lu bytes), " \ - #a" (%lu bytes) ) at " \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)), (a) ) - +#define MALLOC_ALIGNED( x, n, a ) \ + util_malloc_aligned( "MALLOC_ALIGNED( " #x ", " #n " (%lu bytes), " #a \ + " (%lu bytes) ) at " __FILE__ \ + "(" EXPAND_AND_STRINGIFY( __LINE__ ) ") failed", \ + &( x ), ( n ) * sizeof( *( x ) ), ( a ) ) -void -util_malloc_aligned( const char * err_fmt, // Has exactly two %lu in it - void * mem_ref, - size_t n, - size_t a ); +void util_malloc_aligned( const char* err_fmt, // Has exactly two %lu in it + void* mem_ref, size_t n, size_t a ); // FREE_ALIGNED behaves equivalently to FREE. -#define FREE_ALIGNED(x) util_free_aligned(&(x)) +#define FREE_ALIGNED( x ) util_free_aligned( &( x ) ) -void -util_free_aligned( void * mem_ref ); +void util_free_aligned( void* mem_ref ); -void -log_printf( const char *fmt, ... ); +void log_printf( const char* fmt, ... ); // This function returns a value to prevent the compiler from // optimizing it away the function body. The caller should not use it // though so the declaration casts away the return. -#define nanodelay(i) ((void)_nanodelay(i)) -uint32_t -_nanodelay( uint32_t i ); +#define nanodelay( i ) ( (void)_nanodelay( i ) ) +uint32_t _nanodelay( uint32_t i ); END_C_DECLS diff --git a/src/util/v16/v16.h b/src/util/v16/v16.h index 0bf52264..d8818765 100644 --- a/src/util/v16/v16.h +++ b/src/util/v16/v16.h @@ -4,11 +4,11 @@ #define IN_v16_h /* FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? 
*/ #ifdef __cplusplus -# if defined USE_V16_PORTABLE -# include "v16_portable.h" -# elif defined USE_V16_AVX512 -# include "v16_avx512.h" -# endif +#if defined USE_V16_PORTABLE +#include "v16_portable.h" +#elif defined USE_V16_AVX512 +#include "v16_avx512.h" +#endif #endif #undef IN_v16_h #endif // _v16_h_ diff --git a/src/util/v16/v16_avx512.h b/src/util/v16/v16_avx512.h index 69d0922d..87cbcdbe 100644 --- a/src/util/v16/v16_avx512.h +++ b/src/util/v16/v16_avx512.h @@ -12,399 +12,300 @@ #include #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v16 { - class v16; - class v16int; - class v16float; +class v16; +class v16int; +class v16float; - //////////////// - // v16 base class +//////////////// +// v16 base class - class v16 - { +class v16 +{ friend class v16int; friend class v16float; // v16 miscellaneous friends - friend inline int any( const v16 &a ) ALWAYS_INLINE; - friend inline int all( const v16 &a ) ALWAYS_INLINE; + friend inline int any( const v16& a ) ALWAYS_INLINE; + friend inline int all( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 splat( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 shuffle( const v16& a ) ALWAYS_INLINE; - friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + friend inline void swap( v16& a, v16& b ) ALWAYS_INLINE; + friend inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, + v16& a04, v16& a05, v16& a06, v16& a07, + v16& a08, v16& a09, v16& a10, v16& a11, + v16& a12, v16& a13, v16& a14, + v16& a15 ) 
ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& a, + const v16& b ) ALWAYS_INLINE; // v16 memory manipulation friends - friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; - friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; - friend inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) ALWAYS_INLINE; - friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + friend inline void load_16x1( const void* ALIGNED( 64 ) p, + v16& a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void* ALIGNED( 64 ) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void* ALIGNED( 64 ) dst, + const void* ALIGNED( 64 ) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void* ALIGNED( 64 ) a, + void* ALIGNED( 64 ) b ) ALWAYS_INLINE; // v16 transposed memory manipulation friends // Note: Half aligned values are permissible in the 16x2_tr variants. 
- friend inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) ALWAYS_INLINE; - friend inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void load_16x2_bc( const void * ALIGNED(8) a00, - v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; - friend inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * 
ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; - friend inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - friend inline void load_16x16_bc( const void * ALIGNED(64) a00, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, 
- const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - - friend inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; - friend inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, - void * ALIGNED(8) a01, - void * ALIGNED(8) a02, - void * ALIGNED(8) a03, - void * ALIGNED(8) a04, - void * ALIGNED(8) a05, - void * ALIGNED(8) a06, - void * ALIGNED(8) a07, - void * ALIGNED(8) a08, - void * ALIGNED(8) a09, - void * ALIGNED(8) a10, - void * ALIGNED(8) a11, - void * ALIGNED(8) a12, - void * ALIGNED(8) a13, - void * ALIGNED(8) a14, - void * ALIGNED(8) a15 ) ALWAYS_INLINE; - friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * 
ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x4_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) 
ALWAYS_INLINE; - friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) ALWAYS_INLINE; - friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void + load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) ALWAYS_INLINE; + friend inline void + load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* 
ALIGNED( 8 ) a15, + v16& a, v16& b ) ALWAYS_INLINE; + friend inline void load_16x2_bc( const void* ALIGNED( 8 ) a00, v16& a, + v16& b ) ALWAYS_INLINE; + friend inline void + load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) ALWAYS_INLINE; + friend inline void + load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) ALWAYS_INLINE; + friend inline void + load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, + v16& h ) ALWAYS_INLINE; + friend inline void + 
load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) ALWAYS_INLINE; + friend inline void load_16x16_bc( const void* ALIGNED( 64 ) a00, v16& b00, + v16& b01, v16& b02, v16& b03, v16& b04, + v16& b05, v16& b06, v16& b07, v16& b08, + v16& b09, v16& b10, v16& b11, v16& b12, + v16& b13, v16& b14, + v16& b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, v16& a, + v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, v16& b00, + v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, v16& b06, v16& b07, + v16& b08, v16& b09, v16& b10, 
v16& b11, v16& b12, v16& b13, v16& b14, + v16& b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16& a, void* a00, void* a01, + void* a02, void* a03, void* a04, + void* a05, void* a06, void* a07, + void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, + void* a14, void* a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( + const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, void* ALIGNED( 8 ) a03, + void* ALIGNED( 8 ) a04, void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, void* ALIGNED( 8 ) a09, + void* ALIGNED( 8 ) a10, void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x4_tr( const v16& a, const v16& b, const v16& c, const v16& d, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + 
void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr_p( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) 
a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; protected: - - union - { - int i[16]; - float f[16]; - __m512 v; + union { + int i[16]; + float f[16]; + __m512 v; }; public: + v16() {} // Default constructor - v16() {} // Default constructor - - v16( const v16 &a ) // Copy constructor + v16( const v16& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v16() {} // Default destructor - }; + ~v16() {} // Default destructor +}; - // v16 miscellaneous functions +// v16 miscellaneous functions - inline int any( const v16 &a ) - { - return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || - a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || - a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || - a.i[12] || a.i[13] || a.i[14] || a.i[15]; - } +inline int any( const v16& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7] || a.i[8] || a.i[9] || a.i[10] || a.i[11] || a.i[12] || + a.i[13] || a.i[14] || a.i[15]; +} - inline int all( const v16 &a ) - { - return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && - a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && - a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && - a.i[12] && a.i[13] && a.i[14] && a.i[15]; - } +inline int all( const v16& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7] && a.i[8] && a.i[9] && a.i[10] && a.i[11] && a.i[12] && + a.i[13] && a.i[14] && a.i[15]; +} - template - inline v16 splat( const v16 & a ) - { +template +inline v16 splat( const v16& a ) +{ v16 b; b.v = _mm512_set1_ps( a.v[n] ); return b; - } +} - template - inline v16 shuffle( const v16 & a ) - { +template +inline v16 shuffle( const v16& a ) +{ v16 b; - b.i[ 0] = a.i[i00]; - b.i[ 1] = a.i[i01]; - b.i[ 2] = a.i[i02]; - b.i[ 3] = a.i[i03]; - b.i[ 4] = a.i[i04]; - b.i[ 
5] = a.i[i05]; - b.i[ 6] = a.i[i06]; - b.i[ 7] = a.i[i07]; - b.i[ 8] = a.i[i08]; - b.i[ 9] = a.i[i09]; + b.i[0] = a.i[i00]; + b.i[1] = a.i[i01]; + b.i[2] = a.i[i02]; + b.i[3] = a.i[i03]; + b.i[4] = a.i[i04]; + b.i[5] = a.i[i05]; + b.i[6] = a.i[i06]; + b.i[7] = a.i[i07]; + b.i[8] = a.i[i08]; + b.i[9] = a.i[i09]; b.i[10] = a.i[i10]; b.i[11] = a.i[i11]; b.i[12] = a.i[i12]; @@ -413,98 +314,155 @@ namespace v16 b.i[15] = a.i[i15]; return b; - } +} -# define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v16 &a, v16 &b ) - { +inline void swap( v16& a, v16& b ) +{ __m512 a_v = a.v; a.v = b.v; b.v = a_v; - } +} - inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - - // Start a00 = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // a01 = 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - // a02 = 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - // a03 = 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - // a04 = 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - // a05 = 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - // a06 = 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - // a07 = 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - // a08 = 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - // a09 = 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - // a10 = 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - // a11 = 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - // a12 = 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - // a13 = 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - // a14 = 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - // a15 = 240 241 242 243 244 245 246 
247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( a00.v, a01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( a00.v, a01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( a02.v, a03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( a02.v, a03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( a04.v, a05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( a04.v, a05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( a06.v, a07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( a06.v, a07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( a08.v, a09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( a08.v, a09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( a10.v, a11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( a10.v, a11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( a12.v, a13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( a12.v, a13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( a14.v, a15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_unpackhi_ps( a14.v, a15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - a00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 - a01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 - a02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 - a03.v = _mm512_shuffle_ps( t01, t03, 
_MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 - a04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 - a05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 - a06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 - a07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 - a08.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 - a09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 - a10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 - a11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 - a12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 - a13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 - a14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 - a15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 - - t00 = _mm512_shuffle_f32x4( a00.v, a04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... - t01 = _mm512_shuffle_f32x4( a01.v, a05.v, 0x88 ); // 1 17 33 49 ... - t02 = _mm512_shuffle_f32x4( a02.v, a06.v, 0x88 ); // 2 18 34 50 ... - t03 = _mm512_shuffle_f32x4( a03.v, a07.v, 0x88 ); // 3 19 35 51 ... - t04 = _mm512_shuffle_f32x4( a00.v, a04.v, 0xdd ); // 4 20 36 52 ... - t05 = _mm512_shuffle_f32x4( a01.v, a05.v, 0xdd ); // 5 21 37 53 ... - t06 = _mm512_shuffle_f32x4( a02.v, a06.v, 0xdd ); // 6 22 38 54 ... - t07 = _mm512_shuffle_f32x4( a03.v, a07.v, 0xdd ); // 7 23 39 55 ... - t08 = _mm512_shuffle_f32x4( a08.v, a12.v, 0x88 ); // 128 144 160 176 ... - t09 = _mm512_shuffle_f32x4( a09.v, a13.v, 0x88 ); // 129 145 161 177 ... - t10 = _mm512_shuffle_f32x4( a10.v, a14.v, 0x88 ); // 130 146 162 178 ... - t11 = _mm512_shuffle_f32x4( a11.v, a15.v, 0x88 ); // 131 147 163 179 ... 
- t12 = _mm512_shuffle_f32x4( a08.v, a12.v, 0xdd ); // 132 148 164 180 ... - t13 = _mm512_shuffle_f32x4( a09.v, a13.v, 0xdd ); // 133 149 165 181 ... - t14 = _mm512_shuffle_f32x4( a10.v, a14.v, 0xdd ); // 134 150 166 182 ... - t15 = _mm512_shuffle_f32x4( a11.v, a15.v, 0xdd ); // 135 151 167 183 ... - - a00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 - a01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 - a02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 - a03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 +inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, v16& a04, + v16& a05, v16& a06, v16& a07, v16& a08, v16& a09, + v16& a10, v16& a11, v16& a12, v16& a13, v16& a14, + v16& a15 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + + // Start a00 = 0 1 2 3 4 5 6 + // 7 8 9 10 11 12 13 14 15 + // a01 = 16 17 18 19 20 21 22 + // 23 24 25 26 27 28 29 30 31 + // a02 = 32 33 34 35 36 37 38 + // 39 40 41 42 43 44 45 46 47 + // a03 = 48 49 50 51 52 53 54 + // 55 56 57 58 59 60 61 62 63 + // a04 = 64 65 66 67 68 69 70 + // 71 72 73 74 75 76 77 78 79 + // a05 = 80 81 82 83 84 85 86 + // 87 88 89 90 91 92 93 94 95 + // a06 = 96 97 98 99 100 101 102 + // 103 104 105 106 107 108 109 110 111 + // a07 = 112 113 114 115 116 117 118 + // 119 120 121 122 123 124 125 126 127 + // a08 = 128 129 130 131 132 133 134 + // 135 136 137 138 139 140 141 142 143 + // a09 = 144 145 146 147 148 149 150 + // 151 152 153 154 155 156 157 158 159 + // a10 = 160 161 162 163 164 165 166 + // 167 168 169 170 171 172 173 174 175 + // a11 = 176 177 178 179 180 181 182 + // 183 184 185 186 187 188 189 190 191 + // a12 = 192 193 194 195 196 197 198 + // 199 200 201 202 203 204 205 206 207 + // a13 = 208 209 210 211 212 213 214 + // 215 216 217 218 219 220 221 222 223 + // a14 = 224 225 226 227 228 229 230 + // 231 232 
233 234 235 236 237 238 239 + // a15 = 240 241 242 243 244 245 246 + // 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( a00.v, a01.v ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( a00.v, a01.v ); // 2 18 3 19 6 22 7 23 + // 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( a02.v, a03.v ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( a02.v, a03.v ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( a04.v, a05.v ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( a04.v, a05.v ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = + _mm512_unpacklo_ps( a06.v, a07.v ); // 96 112 97 113 100 116 101 117 + // 104 120 105 121 108 124 109 125 + t07 = + _mm512_unpackhi_ps( a06.v, a07.v ); // 98 114 99 115 102 118 103 119 + // 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( a08.v, a09.v ); // 128 144 129 145 132 148 133 149 + // 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( a08.v, a09.v ); // 130 146 131 147 134 150 135 151 + // 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( a10.v, a11.v ); // 160 176 161 177 164 180 165 181 + // 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( a10.v, a11.v ); // 162 178 163 179 166 182 167 183 + // 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( a12.v, a13.v ); // 192 208 193 209 196 212 197 213 + // 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( a12.v, a13.v ); // 194 210 195 211 198 214 199 215 + // 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( a14.v, a15.v ); // 224 240 225 241 228 244 229 245 + // 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( a14.v, a15.v ); // 226 242 227 243 230 246 231 247 + // 234 250 235 251 238 254 239 255 + + a00.v = _mm512_shuffle_ps( t00, t02, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 + a01.v = _mm512_shuffle_ps( t00, t02, + _MM_SHUFFLE( 
3, 2, 3, 2 ) ); // 1 17 33 49 + a02.v = _mm512_shuffle_ps( t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 + a03.v = _mm512_shuffle_ps( t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 + a04.v = _mm512_shuffle_ps( t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 + a05.v = _mm512_shuffle_ps( t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 + a06.v = _mm512_shuffle_ps( t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 + a07.v = _mm512_shuffle_ps( t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 + a08.v = _mm512_shuffle_ps( t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 + a09.v = _mm512_shuffle_ps( t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 + a10.v = _mm512_shuffle_ps( t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 + a11.v = _mm512_shuffle_ps( t09, t11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 + a12.v = _mm512_shuffle_ps( t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 + a13.v = _mm512_shuffle_ps( t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 + a14.v = _mm512_shuffle_ps( t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 + a15.v = _mm512_shuffle_ps( t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 + + t00 = _mm512_shuffle_f32x4( + a00.v, a04.v, + 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... + t01 = _mm512_shuffle_f32x4( a01.v, a05.v, 0x88 ); // 1 17 33 49 ... + t02 = _mm512_shuffle_f32x4( a02.v, a06.v, 0x88 ); // 2 18 34 50 ... + t03 = _mm512_shuffle_f32x4( a03.v, a07.v, 0x88 ); // 3 19 35 51 ... + t04 = _mm512_shuffle_f32x4( a00.v, a04.v, 0xdd ); // 4 20 36 52 ... + t05 = _mm512_shuffle_f32x4( a01.v, a05.v, 0xdd ); // 5 21 37 53 ... + t06 = _mm512_shuffle_f32x4( a02.v, a06.v, 0xdd ); // 6 22 38 54 ... + t07 = _mm512_shuffle_f32x4( a03.v, a07.v, 0xdd ); // 7 23 39 55 ... + t08 = _mm512_shuffle_f32x4( a08.v, a12.v, 0x88 ); // 128 144 160 176 ... + t09 = _mm512_shuffle_f32x4( a09.v, a13.v, 0x88 ); // 129 145 161 177 ... 
+ t10 = _mm512_shuffle_f32x4( a10.v, a14.v, 0x88 ); // 130 146 162 178 ... + t11 = _mm512_shuffle_f32x4( a11.v, a15.v, 0x88 ); // 131 147 163 179 ... + t12 = _mm512_shuffle_f32x4( a08.v, a12.v, 0xdd ); // 132 148 164 180 ... + t13 = _mm512_shuffle_f32x4( a09.v, a13.v, 0xdd ); // 133 149 165 181 ... + t14 = _mm512_shuffle_f32x4( a10.v, a14.v, 0xdd ); // 134 150 166 182 ... + t15 = _mm512_shuffle_f32x4( a11.v, a15.v, 0xdd ); // 135 151 167 183 ... + + a00.v = _mm512_shuffle_f32x4( + t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 + a01.v = _mm512_shuffle_f32x4( + t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 + a02.v = _mm512_shuffle_f32x4( + t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 + a03.v = _mm512_shuffle_f32x4( + t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 a04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 ... a05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 ... a06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 ... @@ -516,1193 +474,2032 @@ namespace v16 a12.v = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 ... a13.v = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 ... a14.v = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 ... - a15.v = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 255 - } + a15.v = _mm512_shuffle_f32x4( + t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 255 +} -# undef sw +#undef sw - // v16 memory manipulation functions +// v16 memory manipulation functions - // Portable version. - inline void load_16x1( const void * ALIGNED(64) p, - v16 &a ) - { - for( int j = 0; j < 16; j++ ) - a.i[j] = ( ( const int * ALIGNED(64) ) p )[j]; - } +// Portable version. +inline void load_16x1( const void* ALIGNED( 64 ) p, v16& a ) +{ + for ( int j = 0; j < 16; j++ ) + a.i[j] = ( (const int* ALIGNED( 64 ))p )[j]; +} - // Portable version. 
- inline void store_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - for( int j = 0; j < 16; j++ ) - ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; - } +// Portable version. +inline void store_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = a.i[j]; +} - inline void stream_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - for( int j = 0; j < 16; j++ ) - ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; - } +inline void stream_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = a.i[j]; +} - inline void clear_16x1( void * ALIGNED(64) p ) - { - for( int j = 0; j < 16; j++ ) - ( ( int * ALIGNED(64) ) p )[j] = 0; - } +inline void clear_16x1( void* ALIGNED( 64 ) p ) +{ + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = 0; +} - // FIXME: Ordering semantics - inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) - { - for( int j = 0; j < 16; j++ ) - ( ( int * ALIGNED(64) ) dst )[j] = ( ( const int * ALIGNED(64) ) src )[j]; - } +// FIXME: Ordering semantics +inline void copy_16x1( void* ALIGNED( 64 ) dst, const void* ALIGNED( 64 ) src ) +{ + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))dst )[j] = ( (const int* ALIGNED( 64 ))src )[j]; +} - inline void swap_16x1( void * ALIGNED(64) a, - void * ALIGNED(64) b ) - { +inline void swap_16x1( void* ALIGNED( 64 ) a, void* ALIGNED( 64 ) b ) +{ int t; - for( int j = 0; j < 16; j++ ) + for ( int j = 0; j < 16; j++ ) { - t = ( ( int * ALIGNED(64) ) a )[j]; - ( ( int * ALIGNED(64) ) a )[j] = ( ( int * ALIGNED(64) ) b )[j]; - ( ( int * ALIGNED(64) ) b )[j] = t; + t = ( (int* ALIGNED( 64 ))a )[j]; + ( (int* ALIGNED( 64 ))a )[j] = ( (int* ALIGNED( 64 ))b )[j]; + ( (int* ALIGNED( 64 ))b )[j] = t; } - } +} - // v16 transposed memory manipulation functions - - inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const 
void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) - { - a.i[ 0] = ((const int *)a00)[0]; - a.i[ 1] = ((const int *)a01)[0]; - a.i[ 2] = ((const int *)a02)[0]; - a.i[ 3] = ((const int *)a03)[0]; - a.i[ 4] = ((const int *)a04)[0]; - a.i[ 5] = ((const int *)a05)[0]; - a.i[ 6] = ((const int *)a06)[0]; - a.i[ 7] = ((const int *)a07)[0]; - a.i[ 8] = ((const int *)a08)[0]; - a.i[ 9] = ((const int *)a09)[0]; - a.i[10] = ((const int *)a10)[0]; - a.i[11] = ((const int *)a11)[0]; - a.i[12] = ((const int *)a12)[0]; - a.i[13] = ((const int *)a13)[0]; - a.i[14] = ((const int *)a14)[0]; - a.i[15] = ((const int *)a15)[0]; - } +// v16 transposed memory manipulation functions - inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &b00, v16 &b01 ) - { - __m512 t00, t01, t02, t04, t06, t08, t09, t10, t12, t14; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; - - u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 
= _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 
249 236 252 237 253 - - u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - - t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - - b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 - b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 - } - - inline void load_16x2_bc( const void * ALIGNED(64) a00, - v16 &b00, v16 &b01 ) - { +inline void load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const 
void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) +{ + a.i[0] = ( (const int*)a00 )[0]; + a.i[1] = ( (const int*)a01 )[0]; + a.i[2] = ( (const int*)a02 )[0]; + a.i[3] = ( (const int*)a03 )[0]; + a.i[4] = ( (const int*)a04 )[0]; + a.i[5] = ( (const int*)a05 )[0]; + a.i[6] = ( (const int*)a06 )[0]; + a.i[7] = ( (const int*)a07 )[0]; + a.i[8] = ( (const int*)a08 )[0]; + a.i[9] = ( (const int*)a09 )[0]; + a.i[10] = ( (const int*)a10 )[0]; + a.i[11] = ( (const int*)a11 )[0]; + a.i[12] = ( (const int*)a12 )[0]; + a.i[13] = ( (const int*)a13 )[0]; + a.i[14] = ( (const int*)a14 )[0]; + a.i[15] = ( (const int*)a15 )[0]; +} + +inline void +load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& b00, v16& b01 ) +{ + __m512 t00, t01, t02, t04, t06, t08, t09, t10, t12, t14; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + u00 = _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = + _mm512_load_ps( (const 
float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + u07 = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + u08 = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + u09 = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + u10 = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 170 171 172 173 174 175 + u11 = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 183 + // 184 185 186 187 188 189 190 191 + u12 = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + u13 = + _mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + u14 = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + u15 = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( + u00, + u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t02 = _mm512_unpacklo_ps( + u02, + u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t04 = _mm512_unpacklo_ps( + u04, + u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 + // 120 105 121 108 124 109 125 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 + // 152 137 153 140 156 141 157 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 + // 184 169 185 172 188 173 189 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 + // 216 201 217 204 220 205 221 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 + // 248 233 249 236 252 237 253 + + u00 
= _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + u04 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + u08 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + u12 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + + t00 = _mm512_shuffle_f32x4( u00, u04, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t08 = _mm512_shuffle_f32x4( u08, u12, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + + b00.v = _mm512_shuffle_f32x4( t00, t08, + 0x88 ); // 0 16 32 48 64 80 96 112 128 + // 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, + 0x88 ); // 1 17 33 49 66 81 97 113 129 + // 145 161 177 193 209 225 241 +} + +inline void load_16x2_bc( const void* ALIGNED( 64 ) a00, v16& b00, v16& b01 ) +{ __m512 t00; - t00 = _mm512_load_ps( (const float *)a00 ); + t00 = _mm512_load_ps( (const float*)a00 ); b00.v = _mm512_set1_ps( t00[0] ); b01.v = 
_mm512_set1_ps( t00[1] ); - } - - inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; - - u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 
183 184 185 186 187 188 189 190 191 - u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = 
_mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 - u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - - t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 
192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 - - b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 - b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 - b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 - } - - inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; - - u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 
101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 
139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 
170 186 142 158 174 190 - u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 - u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 - - t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 - t11 = _mm512_shuffle_f32x4( u11, u15, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 - - b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 - b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 - b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 - b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 - } - - 
inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; - - u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 
179 180 181 182 183 184 185 186 187 188 189 190 191 - u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 
253 - t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 - u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 - u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - 
u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 - - t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_shuffle_f32x4( u00, u04, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 - t05 = _mm512_shuffle_f32x4( u01, u05, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_shuffle_f32x4( u02, u06, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_shuffle_f32x4( u03, u07, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 - t11 = _mm512_shuffle_f32x4( u11, u15, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 - t12 = _mm512_shuffle_f32x4( u08, u12, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 - t13 = _mm512_shuffle_f32x4( u09, u13, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 - t14 = _mm512_shuffle_f32x4( u10, u14, 0xdd ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 - t15 = _mm512_shuffle_f32x4( u11, u15, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 - - b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 - b01.v = 
_mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 - b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 - b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 - b04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 20 36 52 69 84 100 116 132 148 164 180 196 212 228 244 - b05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 21 37 53 70 85 101 117 133 149 165 181 197 213 229 245 - b06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 22 38 54 71 86 102 118 134 150 166 182 198 214 230 246 - b07.v = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 23 39 55 72 87 103 119 135 151 167 183 199 215 231 247 - } - - // This is the reference AVX-512 implementation. - inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - - b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 
63 - b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - b08.v = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - b09.v = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - b10.v = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - b11.v = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - b12.v = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - b13.v = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - b14.v = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - b15.v = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 
121 108 124 109 125 - t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - b06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - b08.v = 
_mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - b09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - b10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 - b11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 - b12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - b13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - b14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - b15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 - - t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 - t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - t08 = _mm512_shuffle_f32x4( b08.v, b12.v, 0x88 ); // 128 144 160 176 136 152 168 
184 192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( b09.v, b13.v, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - t10 = _mm512_shuffle_f32x4( b10.v, b14.v, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 - t11 = _mm512_shuffle_f32x4( b11.v, b15.v, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 - t12 = _mm512_shuffle_f32x4( b08.v, b12.v, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 - t13 = _mm512_shuffle_f32x4( b09.v, b13.v, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 - t14 = _mm512_shuffle_f32x4( b10.v, b14.v, 0xdd ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 - t15 = _mm512_shuffle_f32x4( b11.v, b15.v, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 - - b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 - b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 - b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 - b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 - b04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 20 36 52 69 84 100 116 132 148 164 180 196 212 228 244 - b05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 21 37 53 70 85 101 117 133 149 165 181 197 213 229 245 - b06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 22 38 54 71 86 102 118 134 150 166 182 198 214 230 246 - b07.v = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 23 39 55 72 87 103 119 135 151 167 183 199 215 231 247 - b08.v = _mm512_shuffle_f32x4( t00, t08, 0xdd ); // 8 24 40 56 73 88 104 120 136 152 168 184 200 216 232 248 - b09.v = _mm512_shuffle_f32x4( t01, t09, 0xdd ); // 9 25 41 57 74 89 105 121 137 153 169 185 201 217 233 249 - b10.v = _mm512_shuffle_f32x4( 
t02, t10, 0xdd ); // 10 26 42 58 75 90 106 122 138 154 170 186 202 218 234 250 - b11.v = _mm512_shuffle_f32x4( t03, t11, 0xdd ); // 11 27 43 59 76 91 107 123 139 155 171 187 203 219 235 251 - b12.v = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 28 44 60 77 92 108 124 140 156 172 188 204 220 236 252 - b13.v = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 29 45 61 78 93 109 125 141 157 173 189 205 221 237 253 - b14.v = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 30 46 62 79 94 110 126 142 158 174 190 206 222 238 254 - b15.v = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 95 111 127 143 159 175 191 207 223 239 255 - } - - // This is the reference AVX-512 implementation. - inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) - { +} + +inline void +load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + u00 = _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 
25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + u07 = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + u08 = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + u09 = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + u10 = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 170 171 172 173 174 175 + u11 = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 183 + // 184 185 186 187 188 189 190 191 + u12 = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + u13 = + _mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + u14 = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + u15 = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( + u00, + u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( + u00, + u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( + u02, + u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( + u02, + u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = 
_mm512_unpacklo_ps( + u04, + u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( + u04, + u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 + // 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 + // 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 + // 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 + // 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 + // 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 + // 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 + // 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 + // 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 + // 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 + // 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + u04 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) 
); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + u08 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 + // 170 186 142 158 174 190 + u12 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 + // 234 250 206 222 238 254 + + t00 = _mm512_shuffle_f32x4( u00, u04, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t08 = _mm512_shuffle_f32x4( u08, u12, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, + 0x88 ); // 130 146 162 178 138 154 170 186 194 + // 210 226 242 202 218 234 250 + + b00.v = _mm512_shuffle_f32x4( t00, t08, + 0x88 ); // 0 16 32 48 64 80 96 112 128 + // 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, + 0x88 ); // 1 17 33 49 66 81 97 113 129 + // 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, + 0x88 ); // 2 18 34 50 67 82 98 114 130 + // 146 162 178 194 210 226 242 +} + +inline void +load_16x4_tr( const 
void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + u00 = _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + u07 = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + u08 = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + u09 = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + u10 = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 170 171 172 173 174 175 + u11 = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 
183 + // 184 185 186 187 188 189 190 191 + u12 = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + u13 = + _mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + u14 = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + u15 = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( + u00, + u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( + u00, + u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( + u02, + u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( + u02, + u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( + u04, + u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( + u04, + u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 + // 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 + // 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 + // 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 + // 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 + // 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 + // 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 + // 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 + // 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( 
u14, u15 ); // 224 240 225 241 228 244 229 245 232 + // 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 + // 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + u03 = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 + // 43 59 15 31 47 63 + u04 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + u07 = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 + // 107 123 79 95 111 127 + u08 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 + // 170 186 142 158 174 190 + u11 = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 + // 171 187 143 159 175 191 + u12 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); 
// 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 + // 234 250 206 222 238 254 + u15 = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 + // 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( u00, u04, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( u03, u07, + 0x88 ); // 3 19 35 51 11 27 43 59 67 + // 83 99 115 75 91 107 123 + t08 = _mm512_shuffle_f32x4( u08, u12, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, + 0x88 ); // 130 146 162 178 138 154 170 186 194 + // 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( u11, u15, + 0x88 ); // 131 147 163 179 139 155 171 187 195 + // 211 227 243 203 219 235 251 + + b00.v = _mm512_shuffle_f32x4( t00, t08, + 0x88 ); // 0 16 32 48 64 80 96 112 128 + // 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, + 0x88 ); // 1 17 33 49 66 81 97 113 129 + // 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, + 0x88 ); // 2 18 34 50 67 82 98 114 130 + // 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, + 0x88 ); // 3 19 35 51 68 83 99 115 131 + // 147 163 179 195 211 227 243 +} + +inline void +load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, 
const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + u00 = _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + u07 = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + u08 = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + u09 = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + u10 = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 170 171 172 173 174 175 + u11 = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 183 + // 184 185 186 187 188 189 190 191 + u12 = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + u13 = + 
_mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + u14 = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + u15 = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( + u00, + u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( + u00, + u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( + u02, + u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( + u02, + u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( + u04, + u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( + u04, + u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 + // 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 + // 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 + // 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 + // 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 + // 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 + // 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 + // 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 + // 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 + // 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 + // 
250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + u03 = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 + // 43 59 15 31 47 63 + u04 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + u07 = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 + // 107 123 79 95 111 127 + u08 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 + // 170 186 142 158 174 190 + u11 = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 + // 171 187 143 159 175 191 + u12 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 
214 230 246 202 218 + // 234 250 206 222 238 254 + u15 = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 + // 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( u00, u04, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( u03, u07, + 0x88 ); // 3 19 35 51 11 27 43 59 67 + // 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( u00, u04, + 0xdd ); // 4 20 36 52 12 28 44 60 68 + // 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( u01, u05, + 0xdd ); // 5 21 37 53 13 29 45 61 69 + // 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( u02, u06, + 0xdd ); // 6 22 38 54 14 30 46 62 70 + // 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( u03, u07, + 0xdd ); // 7 23 39 55 15 31 47 63 71 + // 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( u08, u12, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, + 0x88 ); // 130 146 162 178 138 154 170 186 194 + // 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( u11, u15, + 0x88 ); // 131 147 163 179 139 155 171 187 195 + // 211 227 243 203 219 235 251 + t12 = _mm512_shuffle_f32x4( u08, u12, + 0xdd ); // 132 148 164 180 140 156 172 188 196 + // 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( u09, u13, + 0xdd ); // 133 149 165 181 141 157 173 189 197 + // 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( u10, u14, + 0xdd ); // 134 150 166 182 142 158 174 190 198 + // 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( u11, u15, + 0xdd ); // 135 151 167 183 143 159 175 191 
199 + // 215 231 247 207 223 239 255 + + b00.v = _mm512_shuffle_f32x4( t00, t08, + 0x88 ); // 0 16 32 48 64 80 96 112 128 + // 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, + 0x88 ); // 1 17 33 49 66 81 97 113 129 + // 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, + 0x88 ); // 2 18 34 50 67 82 98 114 130 + // 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, + 0x88 ); // 3 19 35 51 68 83 99 115 131 + // 147 163 179 195 211 227 243 + b04.v = _mm512_shuffle_f32x4( t04, t12, + 0x88 ); // 4 20 36 52 69 84 100 116 132 + // 148 164 180 196 212 228 244 + b05.v = _mm512_shuffle_f32x4( t05, t13, + 0x88 ); // 5 21 37 53 70 85 101 117 133 + // 149 165 181 197 213 229 245 + b06.v = _mm512_shuffle_f32x4( t06, t14, + 0x88 ); // 6 22 38 54 71 86 102 118 134 + // 150 166 182 198 214 230 246 + b07.v = _mm512_shuffle_f32x4( t07, t15, + 0x88 ); // 7 23 39 55 72 87 103 119 135 + // 151 167 183 199 215 231 247 +} + +// This is the reference AVX-512 implementation. 
+inline void +load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + + b00.v = + _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + b01.v = + _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + b02.v = + _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + b03.v = + _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + b04.v = + _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + b05.v = + _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + b06.v = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + b07.v = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + b08.v = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + b09.v = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + b10.v = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 
170 171 172 173 174 175 + b11.v = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 183 + // 184 185 186 187 188 189 190 191 + b12.v = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + b13.v = + _mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + b14.v = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + b15.v = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 + // 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = + _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 + // 104 120 105 121 108 124 109 125 + t07 = + _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 + // 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 + // 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 + // 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 + // 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 + // 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 + 
// 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 + // 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 + // 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 + // 234 250 235 251 238 254 239 255 + + b00.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 + // 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 + // 107 123 79 95 111 127 + b08.v = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + b09.v = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + b10.v = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 + // 170 186 142 158 174 190 + b11.v = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 + // 171 187 143 159 175 191 
+ b12.v = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + b13.v = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + b14.v = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 + // 234 250 206 222 238 254 + b15.v = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 + // 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( b00.v, b04.v, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0x88 ); // 3 19 35 51 11 27 43 59 67 + // 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, + 0xdd ); // 4 20 36 52 12 28 44 60 68 + // 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0xdd ); // 5 21 37 53 13 29 45 61 69 + // 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0xdd ); // 6 22 38 54 14 30 46 62 70 + // 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0xdd ); // 7 23 39 55 15 31 47 63 71 + // 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( b08.v, b12.v, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( b09.v, b13.v, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( b10.v, b14.v, + 0x88 ); // 130 146 162 178 138 154 170 186 194 + // 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( b11.v, b15.v, + 0x88 ); // 131 147 163 179 139 155 171 187 195 + // 211 227 243 203 219 235 251 + t12 = 
_mm512_shuffle_f32x4( b08.v, b12.v, + 0xdd ); // 132 148 164 180 140 156 172 188 196 + // 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( b09.v, b13.v, + 0xdd ); // 133 149 165 181 141 157 173 189 197 + // 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( b10.v, b14.v, + 0xdd ); // 134 150 166 182 142 158 174 190 198 + // 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( b11.v, b15.v, + 0xdd ); // 135 151 167 183 143 159 175 191 199 + // 215 231 247 207 223 239 255 + + b00.v = _mm512_shuffle_f32x4( t00, t08, + 0x88 ); // 0 16 32 48 64 80 96 112 128 + // 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, + 0x88 ); // 1 17 33 49 66 81 97 113 129 + // 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, + 0x88 ); // 2 18 34 50 67 82 98 114 130 + // 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, + 0x88 ); // 3 19 35 51 68 83 99 115 131 + // 147 163 179 195 211 227 243 + b04.v = _mm512_shuffle_f32x4( t04, t12, + 0x88 ); // 4 20 36 52 69 84 100 116 132 + // 148 164 180 196 212 228 244 + b05.v = _mm512_shuffle_f32x4( t05, t13, + 0x88 ); // 5 21 37 53 70 85 101 117 133 + // 149 165 181 197 213 229 245 + b06.v = _mm512_shuffle_f32x4( t06, t14, + 0x88 ); // 6 22 38 54 71 86 102 118 134 + // 150 166 182 198 214 230 246 + b07.v = _mm512_shuffle_f32x4( t07, t15, + 0x88 ); // 7 23 39 55 72 87 103 119 135 + // 151 167 183 199 215 231 247 + b08.v = _mm512_shuffle_f32x4( t00, t08, + 0xdd ); // 8 24 40 56 73 88 104 120 136 + // 152 168 184 200 216 232 248 + b09.v = _mm512_shuffle_f32x4( t01, t09, + 0xdd ); // 9 25 41 57 74 89 105 121 137 + // 153 169 185 201 217 233 249 + b10.v = _mm512_shuffle_f32x4( t02, t10, + 0xdd ); // 10 26 42 58 75 90 106 122 138 + // 154 170 186 202 218 234 250 + b11.v = _mm512_shuffle_f32x4( t03, t11, + 0xdd ); // 11 27 43 59 76 91 107 123 139 + // 155 171 187 203 219 235 251 + b12.v = _mm512_shuffle_f32x4( t04, t12, + 0xdd ); // 12 28 44 60 77 92 108 124 140 + // 156 172 
188 204 220 236 252 + b13.v = _mm512_shuffle_f32x4( t05, t13, + 0xdd ); // 13 29 45 61 78 93 109 125 141 + // 157 173 189 205 221 237 253 + b14.v = _mm512_shuffle_f32x4( t06, t14, + 0xdd ); // 14 30 46 62 79 94 110 126 142 + // 158 174 190 206 222 238 254 + b15.v = _mm512_shuffle_f32x4( t07, t15, + 0xdd ); // 15 31 47 63 79 95 111 127 143 + // 159 175 191 207 223 239 255 +} + +// This is the reference AVX-512 implementation. +inline void +load_16x8_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07 ) +{ __m512 t00, t01, t02, t03, t04, t05, t06, t07; - __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0 ); - - b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - - t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 
53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - - b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - b06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - - t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 
68 84 100 116 76 92 108 124 - t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - - b00.v = _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 - b01.v = _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 - b02.v = _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 - b03.v = _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 - b04.v = _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 - b05.v = _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 - b06.v = _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 - b07.v = _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 - } - - // This is the reference AVX-512 implementation. 
- inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - - __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0 ); - - b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - b08.v = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - b09.v = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - b10.v = _mm512_load_ps( (const float *)a10 ); // 160 161 162 
163 164 165 166 167 168 169 170 171 172 173 174 175 - b11.v = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - b12.v = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - b13.v = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - b14.v = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - b15.v = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( b12.v, 
b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - b06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - b08.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - b09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - b10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 - b11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 - b12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - b13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 
2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - b14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - b15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 - - t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 - t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - t08 = _mm512_shuffle_f32x4( b08.v, b12.v, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 - t09 = _mm512_shuffle_f32x4( b09.v, b13.v, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 - t10 = _mm512_shuffle_f32x4( b10.v, b14.v, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 - t11 = _mm512_shuffle_f32x4( b11.v, b15.v, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 - t12 = _mm512_shuffle_f32x4( b08.v, b12.v, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 - t13 = _mm512_shuffle_f32x4( b09.v, b13.v, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 - t14 = _mm512_shuffle_f32x4( b10.v, b14.v, 0xdd ); // 134 150 166 182 142 158 174 190 198 
214 230 246 206 222 238 254 - t15 = _mm512_shuffle_f32x4( b11.v, b15.v, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 - - b00.v = _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 - b01.v = _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 - b02.v = _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 - b03.v = _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 - b04.v = _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 - b05.v = _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 - b06.v = _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 - b07.v = _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 - b08.v = _mm512_permutexvar_ps( idx, t08 ); // 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 - b09.v = _mm512_permutexvar_ps( idx, t09 ); // 129 137 145 153 161 169 177 185 193 201 209 217 225 233 241 249 - b10.v = _mm512_permutexvar_ps( idx, t10 ); // 130 138 146 154 162 170 178 186 194 202 210 218 226 234 242 250 - b11.v = _mm512_permutexvar_ps( idx, t11 ); // 131 139 147 155 163 171 179 187 195 203 211 219 227 235 243 251 - b12.v = _mm512_permutexvar_ps( idx, t12 ); // 132 140 148 156 164 172 180 188 196 204 212 220 228 236 244 252 - b13.v = _mm512_permutexvar_ps( idx, t13 ); // 133 141 149 157 165 173 181 189 197 205 213 221 229 237 245 253 - b14.v = _mm512_permutexvar_ps( idx, t14 ); // 134 142 150 158 166 174 182 190 198 206 214 222 230 238 246 254 - b15.v = _mm512_permutexvar_ps( idx, t15 ); // 135 143 151 159 167 175 183 191 199 207 215 223 231 239 247 255 - } - - inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, 
void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) - { - ((int *)a00)[0] = a.i[ 0]; - ((int *)a01)[0] = a.i[ 1]; - ((int *)a02)[0] = a.i[ 2]; - ((int *)a03)[0] = a.i[ 3]; - ((int *)a04)[0] = a.i[ 4]; - ((int *)a05)[0] = a.i[ 5]; - ((int *)a06)[0] = a.i[ 6]; - ((int *)a07)[0] = a.i[ 7]; - ((int *)a08)[0] = a.i[ 8]; - ((int *)a09)[0] = a.i[ 9]; - ((int *)a10)[0] = a.i[10]; - ((int *)a11)[0] = a.i[11]; - ((int *)a12)[0] = a.i[12]; - ((int *)a13)[0] = a.i[13]; - ((int *)a14)[0] = a.i[14]; - ((int *)a15)[0] = a.i[15]; - } - - inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, void * ALIGNED(8) a01, - void * ALIGNED(8) a02, void * ALIGNED(8) a03, - void * ALIGNED(8) a04, void * ALIGNED(8) a05, - void * ALIGNED(8) a06, void * ALIGNED(8) a07, - void * ALIGNED(8) a08, void * ALIGNED(8) a09, - void * ALIGNED(8) a10, void * ALIGNED(8) a11, - void * ALIGNED(8) a12, void * ALIGNED(8) a13, - void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) - { - ((int * ALIGNED(8))a00)[0] = a.i[ 0]; - ((int * ALIGNED(8))a00)[1] = b.i[ 0]; - - ((int * ALIGNED(8))a01)[0] = a.i[ 1]; - ((int * ALIGNED(8))a01)[1] = b.i[ 1]; - - ((int * ALIGNED(8))a02)[0] = a.i[ 2]; - ((int * ALIGNED(8))a02)[1] = b.i[ 2]; - - ((int * ALIGNED(8))a03)[0] = a.i[ 3]; - ((int * ALIGNED(8))a03)[1] = b.i[ 3]; - - ((int * ALIGNED(8))a04)[0] = a.i[ 4]; - ((int * ALIGNED(8))a04)[1] = b.i[ 4]; - - ((int * ALIGNED(8))a05)[0] = a.i[ 5]; - ((int * ALIGNED(8))a05)[1] = b.i[ 5]; - - ((int * ALIGNED(8))a06)[0] = a.i[ 6]; - ((int * ALIGNED(8))a06)[1] = b.i[ 6]; - - ((int * ALIGNED(8))a07)[0] = a.i[ 7]; - ((int * ALIGNED(8))a07)[1] = b.i[ 7]; - - ((int * ALIGNED(8))a08)[0] = a.i[ 8]; - ((int * ALIGNED(8))a08)[1] = b.i[ 8]; - - ((int * ALIGNED(8))a09)[0] = a.i[ 9]; - ((int * ALIGNED(8))a09)[1] = b.i[ 9]; - - ((int * ALIGNED(8))a10)[0] = a.i[10]; - ((int * ALIGNED(8))a10)[1] = b.i[10]; - - ((int * ALIGNED(8))a11)[0] = a.i[11]; - ((int * ALIGNED(8))a11)[1] = b.i[11]; - - ((int * 
ALIGNED(8))a12)[0] = a.i[12]; - ((int * ALIGNED(8))a12)[1] = b.i[12]; - - ((int * ALIGNED(8))a13)[0] = a.i[13]; - ((int * ALIGNED(8))a13)[1] = b.i[13]; - - ((int * ALIGNED(8))a14)[0] = a.i[14]; - ((int * ALIGNED(8))a14)[1] = b.i[14]; - - ((int * ALIGNED(8))a15)[0] = a.i[15]; - ((int * ALIGNED(8))a15)[1] = b.i[15]; - } - - inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, + 1, 4, 0 ); + + b00.v = + _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + b01.v = + _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + b02.v = + _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + b03.v = + _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + b04.v = + _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + b05.v = + _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + b06.v = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 
111 + b07.v = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 + // 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = + _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 + // 104 120 105 121 108 124 109 125 + t07 = + _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 + // 106 122 107 123 110 126 111 127 + + b00.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 + // 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 + // 107 123 79 95 111 127 + + t00 = 
_mm512_shuffle_f32x4( b00.v, b04.v, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0x88 ); // 3 19 35 51 11 27 43 59 67 + // 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, + 0xdd ); // 4 20 36 52 12 28 44 60 68 + // 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0xdd ); // 5 21 37 53 13 29 45 61 69 + // 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0xdd ); // 6 22 38 54 14 30 46 62 70 + // 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0xdd ); // 7 23 39 55 15 31 47 63 71 + // 87 103 119 79 95 111 127 + + b00.v = + _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 + // 72 80 88 96 104 112 120 + b01.v = + _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 + // 73 81 89 97 105 113 121 + b02.v = + _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 + // 74 82 90 98 106 114 122 + b03.v = + _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 + // 75 83 91 99 107 115 123 + b04.v = + _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 + // 76 84 92 100 108 116 124 + b05.v = + _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 + // 77 85 93 101 109 117 125 + b06.v = + _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 + // 78 86 94 102 110 118 126 + b07.v = + _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 + // 79 87 95 103 111 119 127 +} + +// This is the reference AVX-512 implementation. 
+inline void +load_16x16_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + + __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, + 1, 4, 0 ); + + b00.v = + _mm512_load_ps( (const float*)a00 ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + b01.v = + _mm512_load_ps( (const float*)a01 ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + b02.v = + _mm512_load_ps( (const float*)a02 ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + b03.v = + _mm512_load_ps( (const float*)a03 ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + b04.v = + _mm512_load_ps( (const float*)a04 ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + b05.v = + _mm512_load_ps( (const float*)a05 ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + b06.v = + _mm512_load_ps( (const float*)a06 ); // 96 97 98 99 100 101 102 103 + // 104 105 106 107 108 109 110 111 + b07.v = + _mm512_load_ps( (const float*)a07 ); // 112 113 114 115 116 117 118 119 + // 120 121 122 123 124 125 126 127 + b08.v = + _mm512_load_ps( (const float*)a08 ); // 128 129 130 131 132 133 134 135 + // 136 137 138 139 140 141 142 143 + b09.v = + _mm512_load_ps( (const float*)a09 ); // 144 145 146 147 148 149 150 151 + // 152 153 154 155 156 157 158 159 + 
b10.v = + _mm512_load_ps( (const float*)a10 ); // 160 161 162 163 164 165 166 167 + // 168 169 170 171 172 173 174 175 + b11.v = + _mm512_load_ps( (const float*)a11 ); // 176 177 178 179 180 181 182 183 + // 184 185 186 187 188 189 190 191 + b12.v = + _mm512_load_ps( (const float*)a12 ); // 192 193 194 195 196 197 198 199 + // 200 201 202 203 204 205 206 207 + b13.v = + _mm512_load_ps( (const float*)a13 ); // 208 209 210 211 212 213 214 215 + // 216 217 218 219 220 221 222 223 + b14.v = + _mm512_load_ps( (const float*)a14 ); // 224 225 226 227 228 229 230 231 + // 232 233 234 235 236 237 238 239 + b15.v = + _mm512_load_ps( (const float*)a15 ); // 240 241 242 243 244 245 246 247 + // 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 + // 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = + _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 + // 104 120 105 121 108 124 109 125 + t07 = + _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 + // 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 + // 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 + // 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 + // 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 + // 170 186 171 187 174 
190 175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 + // 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 + // 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 + // 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 + // 234 250 235 251 238 254 239 255 + + b00.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 + // 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( + t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 + // 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 + // 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( + t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 + // 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 + // 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( + t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 + // 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 + // 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( + t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 + // 107 123 79 95 111 127 + b08.v = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 + // 168 184 140 156 172 188 + b09.v = _mm512_shuffle_ps( + t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 + // 169 185 141 157 173 189 + b10.v = _mm512_shuffle_ps( + t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 + // 170 186 142 158 174 190 + b11.v = _mm512_shuffle_ps( + t09, t11, + 
_MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 + // 171 187 143 159 175 191 + b12.v = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 + // 232 248 204 220 236 252 + b13.v = _mm512_shuffle_ps( + t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 + // 233 249 205 221 237 253 + b14.v = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 + // 234 250 206 222 238 254 + b15.v = _mm512_shuffle_ps( + t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 + // 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( b00.v, b04.v, + 0x88 ); // 0 16 32 48 8 24 40 56 64 + // 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0x88 ); // 1 17 33 49 9 25 41 57 65 + // 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0x88 ); // 2 18 34 50 10 26 42 58 66 + // 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0x88 ); // 3 19 35 51 11 27 43 59 67 + // 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, + 0xdd ); // 4 20 36 52 12 28 44 60 68 + // 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, + 0xdd ); // 5 21 37 53 13 29 45 61 69 + // 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, + 0xdd ); // 6 22 38 54 14 30 46 62 70 + // 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( b03.v, b07.v, + 0xdd ); // 7 23 39 55 15 31 47 63 71 + // 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( b08.v, b12.v, + 0x88 ); // 128 144 160 176 136 152 168 184 192 + // 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( b09.v, b13.v, + 0x88 ); // 129 145 161 177 137 153 169 185 193 + // 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( b10.v, b14.v, + 0x88 ); // 130 146 162 178 138 154 170 186 194 + // 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( b11.v, b15.v, 
+ 0x88 ); // 131 147 163 179 139 155 171 187 195 + // 211 227 243 203 219 235 251 + t12 = _mm512_shuffle_f32x4( b08.v, b12.v, + 0xdd ); // 132 148 164 180 140 156 172 188 196 + // 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( b09.v, b13.v, + 0xdd ); // 133 149 165 181 141 157 173 189 197 + // 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( b10.v, b14.v, + 0xdd ); // 134 150 166 182 142 158 174 190 198 + // 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( b11.v, b15.v, + 0xdd ); // 135 151 167 183 143 159 175 191 199 + // 215 231 247 207 223 239 255 + + b00.v = + _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 + // 72 80 88 96 104 112 120 + b01.v = + _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 + // 73 81 89 97 105 113 121 + b02.v = + _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 + // 74 82 90 98 106 114 122 + b03.v = + _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 + // 75 83 91 99 107 115 123 + b04.v = + _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 + // 76 84 92 100 108 116 124 + b05.v = + _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 + // 77 85 93 101 109 117 125 + b06.v = + _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 + // 78 86 94 102 110 118 126 + b07.v = + _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 + // 79 87 95 103 111 119 127 + b08.v = + _mm512_permutexvar_ps( idx, t08 ); // 128 136 144 152 160 168 176 184 + // 192 200 208 216 224 232 240 248 + b09.v = + _mm512_permutexvar_ps( idx, t09 ); // 129 137 145 153 161 169 177 185 + // 193 201 209 217 225 233 241 249 + b10.v = + _mm512_permutexvar_ps( idx, t10 ); // 130 138 146 154 162 170 178 186 + // 194 202 210 218 226 234 242 250 + b11.v = + _mm512_permutexvar_ps( idx, t11 ); // 131 139 147 155 163 171 179 187 + // 195 203 211 219 227 235 243 251 + b12.v = + _mm512_permutexvar_ps( idx, t12 ); // 132 140 148 156 164 172 180 188 + // 196 
204 212 220 228 236 244 252 + b13.v = + _mm512_permutexvar_ps( idx, t13 ); // 133 141 149 157 165 173 181 189 + // 197 205 213 221 229 237 245 253 + b14.v = + _mm512_permutexvar_ps( idx, t14 ); // 134 142 150 158 166 174 182 190 + // 198 206 214 222 230 238 246 254 + b15.v = + _mm512_permutexvar_ps( idx, t15 ); // 135 143 151 159 167 175 183 191 + // 199 207 215 223 231 239 247 255 +} + +inline void store_16x1_tr( const v16& a, void* a00, void* a01, void* a02, + void* a03, void* a04, void* a05, void* a06, + void* a07, void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, void* a14, + void* a15 ) +{ + ( (int*)a00 )[0] = a.i[0]; + ( (int*)a01 )[0] = a.i[1]; + ( (int*)a02 )[0] = a.i[2]; + ( (int*)a03 )[0] = a.i[3]; + ( (int*)a04 )[0] = a.i[4]; + ( (int*)a05 )[0] = a.i[5]; + ( (int*)a06 )[0] = a.i[6]; + ( (int*)a07 )[0] = a.i[7]; + ( (int*)a08 )[0] = a.i[8]; + ( (int*)a09 )[0] = a.i[9]; + ( (int*)a10 )[0] = a.i[10]; + ( (int*)a11 )[0] = a.i[11]; + ( (int*)a12 )[0] = a.i[12]; + ( (int*)a13 )[0] = a.i[13]; + ( (int*)a14 )[0] = a.i[14]; + ( (int*)a15 )[0] = a.i[15]; +} + +inline void store_16x2_tr( const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, + void* ALIGNED( 8 ) a03, void* ALIGNED( 8 ) a04, + void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, + void* ALIGNED( 8 ) a09, void* ALIGNED( 8 ) a10, + void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) +{ + ( (int* ALIGNED( 8 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a00 )[1] = b.i[0]; - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ( (int* ALIGNED( 8 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a01 )[1] = b.i[1]; - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ( (int* ALIGNED( 8 ))a02 
)[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a02 )[1] = b.i[2]; - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ( (int* ALIGNED( 8 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a03 )[1] = b.i[3]; - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ( (int* ALIGNED( 8 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a04 )[1] = b.i[4]; - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ( (int* ALIGNED( 8 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a05 )[1] = b.i[5]; - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ( (int* ALIGNED( 8 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a06 )[1] = b.i[6]; - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ( (int* ALIGNED( 8 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a07 )[1] = b.i[7]; - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; + ( (int* ALIGNED( 8 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 8 ))a08 )[1] = b.i[8]; - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; + ( (int* ALIGNED( 8 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 8 ))a09 )[1] = b.i[9]; - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; + ( (int* ALIGNED( 8 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 8 ))a10 )[1] = b.i[10]; - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; + ( (int* ALIGNED( 8 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 8 ))a11 )[1] = b.i[11]; - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = 
b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; + ( (int* ALIGNED( 8 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 8 ))a12 )[1] = b.i[12]; - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - } + ( (int* ALIGNED( 8 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 8 ))a13 )[1] = b.i[13]; - inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; 
- - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - } + ( (int* ALIGNED( 8 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 8 ))a14 )[1] = b.i[14]; - inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - const v16 &e, const v16 &f, const v16 &g, const v16 &h, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) 
a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - ((int * ALIGNED(64))a00)[4] = e.i[ 0]; - ((int * ALIGNED(64))a00)[5] = f.i[ 0]; - ((int * ALIGNED(64))a00)[6] = g.i[ 0]; - ((int * ALIGNED(64))a00)[7] = h.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - ((int * ALIGNED(64))a01)[4] = e.i[ 1]; - ((int * ALIGNED(64))a01)[5] = f.i[ 1]; - ((int * ALIGNED(64))a01)[6] = g.i[ 1]; - ((int * ALIGNED(64))a01)[7] = h.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - ((int * ALIGNED(64))a02)[4] = e.i[ 2]; - ((int * ALIGNED(64))a02)[5] = f.i[ 2]; - ((int * ALIGNED(64))a02)[6] = g.i[ 2]; - ((int * ALIGNED(64))a02)[7] = h.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - ((int * ALIGNED(64))a03)[4] = e.i[ 3]; - ((int * ALIGNED(64))a03)[5] = f.i[ 3]; - ((int * ALIGNED(64))a03)[6] = g.i[ 3]; - ((int * ALIGNED(64))a03)[7] = h.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - ((int * ALIGNED(64))a04)[4] = e.i[ 4]; - ((int * ALIGNED(64))a04)[5] = f.i[ 4]; - ((int * ALIGNED(64))a04)[6] = g.i[ 4]; - ((int * ALIGNED(64))a04)[7] = h.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - ((int * ALIGNED(64))a05)[4] = e.i[ 5]; - ((int * 
ALIGNED(64))a05)[5] = f.i[ 5]; - ((int * ALIGNED(64))a05)[6] = g.i[ 5]; - ((int * ALIGNED(64))a05)[7] = h.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - ((int * ALIGNED(64))a06)[4] = e.i[ 6]; - ((int * ALIGNED(64))a06)[5] = f.i[ 6]; - ((int * ALIGNED(64))a06)[6] = g.i[ 6]; - ((int * ALIGNED(64))a06)[7] = h.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - ((int * ALIGNED(64))a07)[4] = e.i[ 7]; - ((int * ALIGNED(64))a07)[5] = f.i[ 7]; - ((int * ALIGNED(64))a07)[6] = g.i[ 7]; - ((int * ALIGNED(64))a07)[7] = h.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - ((int * ALIGNED(64))a08)[4] = e.i[ 8]; - ((int * ALIGNED(64))a08)[5] = f.i[ 8]; - ((int * ALIGNED(64))a08)[6] = g.i[ 8]; - ((int * ALIGNED(64))a08)[7] = h.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - ((int * ALIGNED(64))a09)[4] = e.i[ 9]; - ((int * ALIGNED(64))a09)[5] = f.i[ 9]; - ((int * ALIGNED(64))a09)[6] = g.i[ 9]; - ((int * ALIGNED(64))a09)[7] = h.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - ((int * ALIGNED(64))a10)[4] = e.i[10]; - ((int * ALIGNED(64))a10)[5] = f.i[10]; - ((int * ALIGNED(64))a10)[6] = g.i[10]; - ((int * ALIGNED(64))a10)[7] = h.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - ((int * ALIGNED(64))a11)[4] = e.i[11]; - ((int * ALIGNED(64))a11)[5] 
= f.i[11]; - ((int * ALIGNED(64))a11)[6] = g.i[11]; - ((int * ALIGNED(64))a11)[7] = h.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - ((int * ALIGNED(64))a12)[4] = e.i[12]; - ((int * ALIGNED(64))a12)[5] = f.i[12]; - ((int * ALIGNED(64))a12)[6] = g.i[12]; - ((int * ALIGNED(64))a12)[7] = h.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - ((int * ALIGNED(64))a13)[4] = e.i[13]; - ((int * ALIGNED(64))a13)[5] = f.i[13]; - ((int * ALIGNED(64))a13)[6] = g.i[13]; - ((int * ALIGNED(64))a13)[7] = h.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - ((int * ALIGNED(64))a14)[4] = e.i[14]; - ((int * ALIGNED(64))a14)[5] = f.i[14]; - ((int * ALIGNED(64))a14)[6] = g.i[14]; - ((int * ALIGNED(64))a14)[7] = h.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - ((int * ALIGNED(64))a15)[4] = e.i[15]; - ((int * ALIGNED(64))a15)[5] = f.i[15]; - ((int * ALIGNED(64))a15)[6] = g.i[15]; - ((int * ALIGNED(64))a15)[7] = h.i[15]; - } + ( (int* ALIGNED( 8 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 8 ))a15 )[1] = b.i[15]; +} - inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - 
void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; - - // Start a00 = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // a01 = 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - // a02 = 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - // a03 = 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - // a04 = 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - // a05 = 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - // a06 = 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - // a07 = 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - // a08 = 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - // a09 = 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - // a10 = 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - // a11 = 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - // a12 = 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - // a13 = 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - // a14 = 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - // a15 = 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 
67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 - u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 - u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 - u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 - u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 - u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 - u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 - u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 - u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 - u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 - u10 = _mm512_shuffle_ps( t09, 
t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 - u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 - u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 - u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 - u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 - u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 - - t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... +inline void store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + + ( (int* ALIGNED( 64 
))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; +} + +inline void store_16x4_tr( const v16& a, const v16& b, const v16& c, + const v16& d, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, + void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, + void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* 
ALIGNED( 64 ))a01 )[3] = d.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = 
a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; +} + +inline void store_16x8_tr( + const v16& a, const v16& b, const v16& c, const v16& d, const v16& e, + const v16& f, const v16& g, const v16& h, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = e.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = f.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = g.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = h.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = e.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = f.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = g.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = h.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = e.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] 
= f.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = g.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = h.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = e.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = f.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = g.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = h.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = e.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = f.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = g.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = h.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = e.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = f.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = g.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = h.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = e.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = f.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = g.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = h.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = e.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = f.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = g.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = h.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; 
+ ( (int* ALIGNED( 64 ))a08 )[4] = e.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = f.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = g.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = h.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = e.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = f.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = g.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = h.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = e.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = f.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = g.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = h.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = e.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = f.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = g.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = h.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = e.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = f.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = g.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = h.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = e.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = f.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = g.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = h.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 
64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = e.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = f.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = g.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = h.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = e.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = f.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = g.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = h.i[15]; +} + +inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + // Start a00 = 0 1 2 3 4 5 6 + // 7 8 9 10 11 12 13 14 15 + // a01 = 16 17 18 19 20 21 22 + // 23 24 25 26 27 28 29 30 31 + // a02 = 32 33 34 35 36 37 38 + // 39 40 41 42 43 44 45 46 47 + // a03 = 48 49 50 51 52 53 54 + // 55 56 57 58 59 60 61 62 63 + // a04 = 64 65 66 67 68 69 70 + // 71 72 73 74 75 76 77 78 79 + // a05 = 80 81 82 83 84 85 86 + // 87 88 89 90 91 92 93 94 95 + // a06 = 96 97 98 99 100 101 102 + // 103 104 105 106 107 108 109 110 111 + 
// a07 = 112 113 114 115 116 117 118 + // 119 120 121 122 123 124 125 126 127 + // a08 = 128 129 130 131 132 133 134 + // 135 136 137 138 139 140 141 142 143 + // a09 = 144 145 146 147 148 149 150 + // 151 152 153 154 155 156 157 158 159 + // a10 = 160 161 162 163 164 165 166 + // 167 168 169 170 171 172 173 174 175 + // a11 = 176 177 178 179 180 181 182 + // 183 184 185 186 187 188 189 190 191 + // a12 = 192 193 194 195 196 197 198 + // 199 200 201 202 203 204 205 206 207 + // a13 = 208 209 210 211 212 213 214 + // 215 216 217 218 219 220 221 222 223 + // a14 = 224 225 226 227 228 229 230 + // 231 232 233 234 235 236 237 238 239 + // a15 = 240 241 242 243 244 245 246 + // 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 + // 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = + _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 + // 104 120 105 121 108 124 109 125 + t07 = + _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 + // 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 + // 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 + // 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 + // 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 + // 170 186 171 187 174 190 
175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 + // 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 + // 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 + // 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 + // 234 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( t00, t02, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 + u01 = _mm512_shuffle_ps( t00, t02, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 + u02 = _mm512_shuffle_ps( t01, t03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 + u03 = _mm512_shuffle_ps( t01, t03, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 + u04 = _mm512_shuffle_ps( t04, t06, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 + u05 = _mm512_shuffle_ps( t04, t06, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 + u06 = _mm512_shuffle_ps( t05, t07, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 + u07 = _mm512_shuffle_ps( t05, t07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 + u08 = _mm512_shuffle_ps( t08, t10, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 + u09 = _mm512_shuffle_ps( t08, t10, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 + u10 = _mm512_shuffle_ps( t09, t11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 + u11 = _mm512_shuffle_ps( t09, t11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 + u12 = _mm512_shuffle_ps( t12, t14, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 + u13 = _mm512_shuffle_ps( t12, t14, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 + u14 = _mm512_shuffle_ps( t13, t15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 + u15 = _mm512_shuffle_ps( t13, t15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 + + t00 = _mm512_shuffle_f32x4( + u00, u04, + 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 ... 
t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 ... t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 ... @@ -1719,10 +2516,14 @@ namespace v16 t14 = _mm512_shuffle_f32x4( u10, u14, 0xdd ); // 134 150 166 182 ... t15 = _mm512_shuffle_f32x4( u11, u15, 0xdd ); // 135 151 167 183 ... - u00 = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 - u01 = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 - u02 = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 - u03 = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 + u00 = _mm512_shuffle_f32x4( + t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 + u01 = _mm512_shuffle_f32x4( + t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 + u02 = _mm512_shuffle_f32x4( + t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 + u03 = _mm512_shuffle_f32x4( + t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 u04 = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 ... u05 = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 ... u06 = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 ... @@ -1734,537 +2535,864 @@ namespace v16 u12 = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 ... u13 = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 ... u14 = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 ... - u15 = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 
255 - - _mm512_store_ps( (float *)a00, u00 ); - _mm512_store_ps( (float *)a01, u01 ); - _mm512_store_ps( (float *)a02, u02 ); - _mm512_store_ps( (float *)a03, u03 ); - _mm512_store_ps( (float *)a04, u04 ); - _mm512_store_ps( (float *)a05, u05 ); - _mm512_store_ps( (float *)a06, u06 ); - _mm512_store_ps( (float *)a07, u07 ); - _mm512_store_ps( (float *)a08, u08 ); - _mm512_store_ps( (float *)a09, u09 ); - _mm512_store_ps( (float *)a10, u10 ); - _mm512_store_ps( (float *)a11, u11 ); - _mm512_store_ps( (float *)a12, u12 ); - _mm512_store_ps( (float *)a13, u13 ); - _mm512_store_ps( (float *)a14, u14 ); - _mm512_store_ps( (float *)a15, u15 ); - } - - // This is the reference AVX-512 implementation. - inline void store_16x8_tr_p( const v16 &b00, - const v16 &b01, - const v16 &b02, - const v16 &b03, - const v16 &b04, - const v16 &b05, - const v16 &b06, - const v16 &b07, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) - { + u15 = _mm512_shuffle_f32x4( + t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 255 + + _mm512_store_ps( (float*)a00, u00 ); + _mm512_store_ps( (float*)a01, u01 ); + _mm512_store_ps( (float*)a02, u02 ); + _mm512_store_ps( (float*)a03, u03 ); + _mm512_store_ps( (float*)a04, u04 ); + _mm512_store_ps( (float*)a05, u05 ); + _mm512_store_ps( (float*)a06, u06 ); + _mm512_store_ps( (float*)a07, u07 ); + _mm512_store_ps( (float*)a08, u08 ); + _mm512_store_ps( (float*)a09, u09 ); + _mm512_store_ps( (float*)a10, u10 ); + _mm512_store_ps( (float*)a11, u11 ); + _mm512_store_ps( (float*)a12, u12 ); + _mm512_store_ps( (float*)a13, u13 ); + _mm512_store_ps( (float*)a14, u14 ); + _mm512_store_ps( (float*)a15, u15 ); +} + +// This is the reference AVX-512 implementation. 
+inline void store_16x8_tr_p( const v16& b00, const v16& b01, const v16& b02, + const v16& b03, const v16& b04, const v16& b05, + const v16& b06, const v16& b07, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07 ) +{ __m512 t00, t01, t02, t03, t04, t05, t06, t07; __m512 u00, u01, u02, u03, u04, u05, u06, u07; - __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, 4, 2, 0 ); + __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, + 4, 2, 0 ); __m512i idx1, idx2; - // Start b00 = 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 - // b01 = 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 - // b02 = 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 - // b03 = 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 - // b04 = 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 - // b05 = 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 - // b06 = 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 - // b07 = 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 - - t00 = _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 - t05 = _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - - idx1 = _mm512_set_epi32( 7+16, 6+16, 
5+16, 4+16, 7, 6, 5, 4, 3+16, 2+16, 1+16, 0+16, 3, 2, 1, 0 ); - idx2 = _mm512_set_epi32( 15+16, 14+16, 13+16, 12+16, 15, 14, 13, 12, 11+16, 10+16, 9+16, 8+16, 11, 10, 9, 8 ); - - u00 = _mm512_permutex2var_ps( t00, idx1, t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_permutex2var_ps( t01, idx1, t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u02 = _mm512_permutex2var_ps( t02, idx1, t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - u03 = _mm512_permutex2var_ps( t03, idx1, t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - u04 = _mm512_permutex2var_ps( t00, idx2, t04 ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_permutex2var_ps( t01, idx2, t05 ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u06 = _mm512_permutex2var_ps( t02, idx2, t06 ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - u07 = _mm512_permutex2var_ps( t03, idx2, t07 ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - - t00 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - - u00 = _mm512_shuffle_ps( t00, t01, 
_MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - u01 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - - _mm512_store_ps( (float *)a00, u00 ); - _mm512_store_ps( (float *)a01, u01 ); - _mm512_store_ps( (float *)a02, u02 ); - _mm512_store_ps( (float *)a03, u03 ); - _mm512_store_ps( (float *)a04, u04 ); - _mm512_store_ps( (float *)a05, u05 ); - _mm512_store_ps( (float *)a06, u06 ); - _mm512_store_ps( (float *)a07, u07 ); - } - - // This is the reference AVX-512 implementation. 
- inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; - __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + // Start b00 = 0 8 + // 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + // b01 = 1 9 + // 17 25 33 41 + // 49 57 65 73 + // 81 89 97 105 + // 113 121 b02 = + // 2 10 18 26 + // 34 42 50 58 + // 66 74 82 90 + // 98 106 114 122 + // b03 = 3 11 + // 19 27 35 43 + // 51 59 67 75 + // 83 91 99 107 + // 115 123 b04 = + // 4 12 20 28 + // 36 44 52 60 + // 68 76 84 92 + // 100 108 116 124 + // b05 = 5 13 + // 21 29 37 45 + // 53 61 69 77 + // 85 93 101 109 + // 117 125 b06 = + // 6 14 22 30 + // 38 46 54 62 + // 70 78 86 94 + // 102 110 118 126 + // b07 = 7 15 + // 23 31 39 47 + // 55 63 71 79 + // 87 95 103 111 + // 119 127 + + t00 = + _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 + // 64 80 96 112 72 88 104 120 + t01 = + _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 + // 65 81 97 113 73 89 105 121 + t02 = + _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 + // 66 82 98 114 74 90 106 122 + t03 = + _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 + // 67 83 99 115 75 91 107 123 + t04 = + _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 + 
// 68 84 100 116 76 92 108 124 + t05 = + _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 + // 69 85 101 117 77 93 109 125 + t06 = + _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 + // 70 86 102 118 78 94 110 126 + t07 = + _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 + // 71 87 103 119 79 95 111 127 + + idx1 = _mm512_set_epi32( 7 + 16, 6 + 16, 5 + 16, 4 + 16, 7, 6, 5, 4, 3 + 16, + 2 + 16, 1 + 16, 0 + 16, 3, 2, 1, 0 ); + idx2 = _mm512_set_epi32( 15 + 16, 14 + 16, 13 + 16, 12 + 16, 15, 14, 13, 12, + 11 + 16, 10 + 16, 9 + 16, 8 + 16, 11, 10, 9, 8 ); + + u00 = _mm512_permutex2var_ps( + t00, idx1, + t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_permutex2var_ps( + t01, idx1, + t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_permutex2var_ps( + t02, idx1, + t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_permutex2var_ps( + t03, idx1, + t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_permutex2var_ps( t00, idx2, + t04 ); // 64 80 96 112 68 84 100 116 72 + // 88 104 120 76 92 108 124 + u05 = _mm512_permutex2var_ps( t01, idx2, + t05 ); // 65 81 97 113 69 85 101 117 73 + // 89 105 121 77 93 109 125 + u06 = _mm512_permutex2var_ps( t02, idx2, + t06 ); // 66 82 98 114 70 86 102 118 74 + // 90 106 122 78 94 110 126 + u07 = _mm512_permutex2var_ps( t03, idx2, + t07 ); // 67 83 99 115 71 87 103 119 75 + // 91 107 123 79 95 111 127 + + t00 = _mm512_shuffle_ps( + u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_shuffle_ps( + u02, u03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 + // 11 27 14 30 15 31 + t02 = _mm512_shuffle_ps( + u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_shuffle_ps( + u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_shuffle_ps( + u04, 
u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_shuffle_ps( + u06, u07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = _mm512_shuffle_ps( + u04, u05, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 + // 105 121 108 124 109 125 + t07 = _mm512_shuffle_ps( + u06, u07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 + // 107 123 110 126 111 127 + + u00 = _mm512_shuffle_ps( + t00, t01, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_shuffle_ps( + t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + u02 = _mm512_shuffle_ps( + t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_shuffle_ps( + t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_shuffle_ps( + t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_shuffle_ps( + t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = _mm512_shuffle_ps( + t06, t07, + _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 + // 106 107 108 109 110 111 + u07 = _mm512_shuffle_ps( + t06, t07, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 + // 122 123 124 125 126 127 + + _mm512_store_ps( (float*)a00, u00 ); + _mm512_store_ps( (float*)a01, u01 ); + _mm512_store_ps( (float*)a02, u02 ); + _mm512_store_ps( (float*)a03, u03 ); + _mm512_store_ps( (float*)a04, u04 ); + _mm512_store_ps( (float*)a05, u05 ); + _mm512_store_ps( (float*)a06, u06 ); + _mm512_store_ps( (float*)a07, u07 ); +} + +// This is the reference AVX-512 implementation. 
+inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, + t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; - __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, 4, 2, 0 ); + __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, + 4, 2, 0 ); __m512i idx1, idx2; - // Start b00 = 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 - // b01 = 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 - // b02 = 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 - // b03 = 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 - // b04 = 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 - // b05 = 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 - // b06 = 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 - // b07 = 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 - // b08 = 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 - // b09 = 129 137 145 153 161 169 177 185 193 201 209 217 225 233 241 249 - // b10 = 130 138 146 154 162 170 178 186 194 202 210 218 226 234 242 250 - // b11 = 131 139 147 155 163 171 179 187 195 203 211 219 227 235 243 251 - // b12 = 132 140 148 156 164 172 180 188 196 204 212 220 228 236 244 252 - // b13 = 133 141 149 157 165 
173 181 189 197 205 213 221 229 237 245 253 - // b14 = 134 142 150 158 166 174 182 190 198 206 214 222 230 238 246 254 - // b15 = 135 143 151 159 167 175 183 191 199 207 215 223 231 239 247 255 - - t00 = _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 - t01 = _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 - t02 = _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 - t03 = _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 - t04 = _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 - t05 = _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 - t06 = _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 - t07 = _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 - t08 = _mm512_permutexvar_ps( idx, b08.v ); // 128 144 160 176 136 152 168 184 192 208 228 240 200 216 232 248 - t09 = _mm512_permutexvar_ps( idx, b09.v ); // 129 145 161 177 137 153 169 185 193 209 229 241 201 217 233 249 - t10 = _mm512_permutexvar_ps( idx, b10.v ); // 130 146 162 178 138 154 170 186 194 210 230 242 202 218 234 250 - t11 = _mm512_permutexvar_ps( idx, b11.v ); // 131 147 163 179 139 155 171 187 195 211 231 243 203 219 235 251 - t12 = _mm512_permutexvar_ps( idx, b12.v ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 - t13 = _mm512_permutexvar_ps( idx, b13.v ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 - t14 = _mm512_permutexvar_ps( idx, b14.v ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 - t15 = _mm512_permutexvar_ps( idx, b15.v ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 - - idx1 = _mm512_set_epi32( 7+16, 6+16, 5+16, 4+16, 7, 6, 5, 4, 3+16, 2+16, 1+16, 0+16, 3, 2, 1, 0 ); - idx2 = 
_mm512_set_epi32( 15+16, 14+16, 13+16, 12+16, 15, 14, 13, 12, 11+16, 10+16, 9+16, 8+16, 11, 10, 9, 8 ); - - u00 = _mm512_permutex2var_ps( t00, idx1, t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 - u01 = _mm512_permutex2var_ps( t01, idx1, t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 - u02 = _mm512_permutex2var_ps( t02, idx1, t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 - u03 = _mm512_permutex2var_ps( t03, idx1, t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 - u04 = _mm512_permutex2var_ps( t00, idx2, t04 ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 - u05 = _mm512_permutex2var_ps( t01, idx2, t05 ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 - u06 = _mm512_permutex2var_ps( t02, idx2, t06 ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 - u07 = _mm512_permutex2var_ps( t03, idx2, t07 ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 - u08 = _mm512_permutex2var_ps( t08, idx1, t12 ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 - u09 = _mm512_permutex2var_ps( t09, idx1, t13 ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 - u10 = _mm512_permutex2var_ps( t10, idx1, t14 ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 - u11 = _mm512_permutex2var_ps( t11, idx1, t15 ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 - u12 = _mm512_permutex2var_ps( t08, idx2, t12 ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 - u13 = _mm512_permutex2var_ps( t09, idx2, t13 ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 - u14 = _mm512_permutex2var_ps( t10, idx2, t14 ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 - u15 = _mm512_permutex2var_ps( t11, idx2, t15 ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 - - t00 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t01 = 
_mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t02 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 - t03 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 - t04 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 - t05 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 - t06 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 - t07 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 - t08 = _mm512_shuffle_ps( u08, u09, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 - t09 = _mm512_shuffle_ps( u10, u11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 - t10 = _mm512_shuffle_ps( u08, u09, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 - t11 = _mm512_shuffle_ps( u10, u11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 - t12 = _mm512_shuffle_ps( u12, u13, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 - t13 = _mm512_shuffle_ps( u14, u15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 - t14 = _mm512_shuffle_ps( u12, u13, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 - t15 = _mm512_shuffle_ps( u14, u15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 - - u00 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 
13 14 15 - u01 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - u02 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 - u03 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 - u04 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 - u05 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - u06 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 - u07 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 - u08 = _mm512_shuffle_ps( t08, t09, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 - u09 = _mm512_shuffle_ps( t08, t09, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 - u10 = _mm512_shuffle_ps( t10, t11, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 - u11 = _mm512_shuffle_ps( t10, t11, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 - u12 = _mm512_shuffle_ps( t12, t13, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 - u13 = _mm512_shuffle_ps( t12, t13, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 - u14 = _mm512_shuffle_ps( t14, t15, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 - u15 = _mm512_shuffle_ps( t14, t15, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 - - _mm512_store_ps( (float *)a00, u00 ); - _mm512_store_ps( (float *)a01, u01 
); - _mm512_store_ps( (float *)a02, u02 ); - _mm512_store_ps( (float *)a03, u03 ); - _mm512_store_ps( (float *)a04, u04 ); - _mm512_store_ps( (float *)a05, u05 ); - _mm512_store_ps( (float *)a06, u06 ); - _mm512_store_ps( (float *)a07, u07 ); - _mm512_store_ps( (float *)a08, u08 ); - _mm512_store_ps( (float *)a09, u09 ); - _mm512_store_ps( (float *)a10, u10 ); - _mm512_store_ps( (float *)a11, u11 ); - _mm512_store_ps( (float *)a12, u12 ); - _mm512_store_ps( (float *)a13, u13 ); - _mm512_store_ps( (float *)a14, u14 ); - _mm512_store_ps( (float *)a15, u15 ); - } - - ////////////// - // v16int class - - class v16int : public v16 - { + // Start b00 = 0 8 + // 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + // b01 = 1 9 + // 17 25 33 41 + // 49 57 65 73 + // 81 89 97 105 + // 113 121 b02 = + // 2 10 18 26 + // 34 42 50 58 + // 66 74 82 90 + // 98 106 114 122 + // b03 = 3 11 + // 19 27 35 43 + // 51 59 67 75 + // 83 91 99 107 + // 115 123 b04 = + // 4 12 20 28 + // 36 44 52 60 + // 68 76 84 92 + // 100 108 116 124 + // b05 = 5 13 + // 21 29 37 45 + // 53 61 69 77 + // 85 93 101 109 + // 117 125 b06 = + // 6 14 22 30 + // 38 46 54 62 + // 70 78 86 94 + // 102 110 118 126 + // b07 = 7 15 + // 23 31 39 47 + // 55 63 71 79 + // 87 95 103 111 + // 119 127 b08 = + // 128 136 144 152 + // 160 168 176 184 + // 192 200 208 216 + // 224 232 240 248 + // b09 = 129 137 + // 145 153 161 169 + // 177 185 193 201 + // 209 217 225 233 + // 241 249 b10 = + // 130 138 146 154 + // 162 170 178 186 + // 194 202 210 218 + // 226 234 242 250 + // b11 = 131 139 + // 147 155 163 171 + // 179 187 195 203 + // 211 219 227 235 + // 243 251 b12 = + // 132 140 148 156 + // 164 172 180 188 + // 196 204 212 220 + // 228 236 244 252 + // b13 = 133 141 + // 149 157 165 173 + // 181 189 197 205 + // 213 221 229 237 + // 245 253 b14 = + // 134 142 150 158 + // 166 174 182 190 + // 198 206 214 222 + // 230 238 246 254 + // b15 = 135 143 + // 151 159 167 175 + // 183 191 199 207 + // 215 223 231 239 + // 247 
255 + + t00 = + _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 + // 64 80 96 112 72 88 104 120 + t01 = + _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 + // 65 81 97 113 73 89 105 121 + t02 = + _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 + // 66 82 98 114 74 90 106 122 + t03 = + _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 + // 67 83 99 115 75 91 107 123 + t04 = + _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 + // 68 84 100 116 76 92 108 124 + t05 = + _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 + // 69 85 101 117 77 93 109 125 + t06 = + _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 + // 70 86 102 118 78 94 110 126 + t07 = + _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 + // 71 87 103 119 79 95 111 127 + t08 = + _mm512_permutexvar_ps( idx, b08.v ); // 128 144 160 176 136 152 168 184 + // 192 208 228 240 200 216 232 248 + t09 = + _mm512_permutexvar_ps( idx, b09.v ); // 129 145 161 177 137 153 169 185 + // 193 209 229 241 201 217 233 249 + t10 = + _mm512_permutexvar_ps( idx, b10.v ); // 130 146 162 178 138 154 170 186 + // 194 210 230 242 202 218 234 250 + t11 = + _mm512_permutexvar_ps( idx, b11.v ); // 131 147 163 179 139 155 171 187 + // 195 211 231 243 203 219 235 251 + t12 = + _mm512_permutexvar_ps( idx, b12.v ); // 132 148 164 180 140 156 172 188 + // 196 212 228 244 204 220 236 252 + t13 = + _mm512_permutexvar_ps( idx, b13.v ); // 133 149 165 181 141 157 173 189 + // 197 213 229 245 205 221 237 253 + t14 = + _mm512_permutexvar_ps( idx, b14.v ); // 134 150 166 182 142 158 174 190 + // 198 214 230 246 206 222 238 254 + t15 = + _mm512_permutexvar_ps( idx, b15.v ); // 135 151 167 183 143 159 175 191 + // 199 215 231 247 207 223 239 255 + + idx1 = _mm512_set_epi32( 7 + 16, 6 + 16, 5 + 16, 4 + 16, 7, 6, 5, 4, 3 + 16, + 2 + 16, 1 + 16, 0 + 16, 3, 2, 1, 0 ); + idx2 = _mm512_set_epi32( 15 + 16, 14 + 16, 13 + 16, 12 + 16, 15, 14, 13, 12, 
+ 11 + 16, 10 + 16, 9 + 16, 8 + 16, 11, 10, 9, 8 ); + + u00 = _mm512_permutex2var_ps( + t00, idx1, + t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_permutex2var_ps( + t01, idx1, + t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_permutex2var_ps( + t02, idx1, + t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_permutex2var_ps( + t03, idx1, + t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_permutex2var_ps( t00, idx2, + t04 ); // 64 80 96 112 68 84 100 116 72 + // 88 104 120 76 92 108 124 + u05 = _mm512_permutex2var_ps( t01, idx2, + t05 ); // 65 81 97 113 69 85 101 117 73 + // 89 105 121 77 93 109 125 + u06 = _mm512_permutex2var_ps( t02, idx2, + t06 ); // 66 82 98 114 70 86 102 118 74 + // 90 106 122 78 94 110 126 + u07 = _mm512_permutex2var_ps( t03, idx2, + t07 ); // 67 83 99 115 71 87 103 119 75 + // 91 107 123 79 95 111 127 + u08 = _mm512_permutex2var_ps( t08, idx1, + t12 ); // 128 144 160 176 132 148 164 180 136 + // 152 168 184 140 156 172 188 + u09 = _mm512_permutex2var_ps( t09, idx1, + t13 ); // 129 145 161 177 133 149 165 181 137 + // 153 169 185 141 157 173 189 + u10 = _mm512_permutex2var_ps( t10, idx1, + t14 ); // 130 146 162 178 134 150 166 182 138 + // 154 170 186 142 158 174 190 + u11 = _mm512_permutex2var_ps( t11, idx1, + t15 ); // 131 147 163 179 135 151 167 183 139 + // 155 171 187 143 159 175 191 + u12 = _mm512_permutex2var_ps( t08, idx2, + t12 ); // 192 208 224 240 196 212 228 244 200 + // 216 232 248 204 220 236 252 + u13 = _mm512_permutex2var_ps( t09, idx2, + t13 ); // 193 209 225 241 197 213 229 245 201 + // 217 233 249 205 221 237 253 + u14 = _mm512_permutex2var_ps( t10, idx2, + t14 ); // 194 210 226 242 198 214 230 246 202 + // 218 234 250 206 222 238 254 + u15 = _mm512_permutex2var_ps( t11, idx2, + t15 ); // 195 211 227 243 199 215 231 247 203 + // 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_ps( + u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 
4 20 5 21 + // 8 24 9 25 12 28 13 29 + t01 = _mm512_shuffle_ps( + u02, u03, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 + // 11 27 14 30 15 31 + t02 = _mm512_shuffle_ps( + u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 + // 40 56 41 57 44 60 45 61 + t03 = _mm512_shuffle_ps( + u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 + // 42 58 43 59 46 62 47 63 + t04 = _mm512_shuffle_ps( + u04, u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 + // 72 88 73 89 76 92 77 93 + t05 = _mm512_shuffle_ps( + u06, u07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 + // 74 90 75 91 78 94 79 95 + t06 = _mm512_shuffle_ps( + u04, u05, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 + // 105 121 108 124 109 125 + t07 = _mm512_shuffle_ps( + u06, u07, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 + // 107 123 110 126 111 127 + t08 = _mm512_shuffle_ps( + u08, u09, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 129 145 132 148 133 149 136 152 + // 137 153 140 156 141 157 + t09 = _mm512_shuffle_ps( + u10, u11, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 131 147 134 150 135 151 138 154 + // 139 155 142 158 143 159 + t10 = _mm512_shuffle_ps( + u08, u09, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 160 176 161 177 164 180 165 181 168 184 + // 169 185 172 188 173 189 + t11 = _mm512_shuffle_ps( + u10, u11, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 162 178 163 179 166 182 167 183 170 186 + // 171 187 174 190 175 191 + t12 = _mm512_shuffle_ps( + u12, u13, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 193 209 196 212 197 213 200 216 + // 201 217 204 220 205 221 + t13 = _mm512_shuffle_ps( + u14, u15, + _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 195 211 198 214 199 215 202 218 + // 203 219 206 222 207 223 + t14 = _mm512_shuffle_ps( + u12, u13, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 224 240 225 241 228 244 229 245 232 248 + // 233 249 236 252 237 253 + t15 = _mm512_shuffle_ps( + u14, u15, + _MM_SHUFFLE( 3, 2, 3, 2 ) ); 
// 226 242 227 243 230 246 231 247 234 250 + // 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( + t00, t01, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + u01 = _mm512_shuffle_ps( + t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 + // 24 25 26 27 28 29 30 31 + u02 = _mm512_shuffle_ps( + t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 + // 40 41 42 43 44 45 46 47 + u03 = _mm512_shuffle_ps( + t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 + // 56 57 58 59 60 61 62 63 + u04 = _mm512_shuffle_ps( + t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 + // 72 73 74 75 76 77 78 79 + u05 = _mm512_shuffle_ps( + t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 + // 88 89 90 91 92 93 94 95 + u06 = _mm512_shuffle_ps( + t06, t07, + _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 + // 106 107 108 109 110 111 + u07 = _mm512_shuffle_ps( + t06, t07, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 + // 122 123 124 125 126 127 + u08 = _mm512_shuffle_ps( + t08, t09, + _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 128 129 130 131 132 133 134 135 136 137 + // 138 139 140 141 142 143 + u09 = _mm512_shuffle_ps( + t08, t09, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 144 145 146 147 148 149 150 151 152 153 + // 154 155 156 157 158 159 + u10 = _mm512_shuffle_ps( + t10, t11, + _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 160 161 162 163 164 165 166 167 168 169 + // 170 171 172 173 174 175 + u11 = _mm512_shuffle_ps( + t10, t11, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 176 177 178 179 180 181 182 183 184 185 + // 186 187 188 189 190 191 + u12 = _mm512_shuffle_ps( + t12, t13, + _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 192 193 194 195 196 197 198 199 200 201 + // 202 203 204 205 206 207 + u13 = _mm512_shuffle_ps( + t12, t13, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 208 209 210 211 212 213 214 215 216 217 + // 218 219 220 221 222 223 + u14 = _mm512_shuffle_ps( + t14, t15, + _MM_SHUFFLE( 2, 0, 
2, 0 ) ); // 224 225 226 227 228 229 230 231 232 233 + // 234 235 236 237 238 239 + u15 = _mm512_shuffle_ps( + t14, t15, + _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 240 241 242 243 244 245 246 247 248 249 + // 250 251 252 253 254 255 + + _mm512_store_ps( (float*)a00, u00 ); + _mm512_store_ps( (float*)a01, u01 ); + _mm512_store_ps( (float*)a02, u02 ); + _mm512_store_ps( (float*)a03, u03 ); + _mm512_store_ps( (float*)a04, u04 ); + _mm512_store_ps( (float*)a05, u05 ); + _mm512_store_ps( (float*)a06, u06 ); + _mm512_store_ps( (float*)a07, u07 ); + _mm512_store_ps( (float*)a08, u08 ); + _mm512_store_ps( (float*)a09, u09 ); + _mm512_store_ps( (float*)a10, u10 ); + _mm512_store_ps( (float*)a11, u11 ); + _mm512_store_ps( (float*)a12, u12 ); + _mm512_store_ps( (float*)a13, u13 ); + _mm512_store_ps( (float*)a14, u14 ); + _mm512_store_ps( (float*)a15, u15 ); +} + +////////////// +// v16int class + +class v16int : public v16 +{ // v16int prefix unary operator friends - friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator~( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16int prefix increment / decrement operator friends - friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a ) ALWAYS_INLINE; // v16int postfix increment / decrement operator friends - friend inline v16int operator ++( v16int & a, int ) 
ALWAYS_INLINE; - friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a, int ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a, int ) ALWAYS_INLINE; // v16int binary operator friends - friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator*(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator/( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator%( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator^( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator|( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int logical operator friends - 
friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16int abs( const v16int& a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& t, + const v16& f ) ALWAYS_INLINE; // v16float unary operator friends - friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float miscellaneous friends - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const 
v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; public: - // v16int constructors / destructors - v16int() {} // Default constructor + v16int() {} // Default constructor - v16int( const v16int &a ) // Copy constructor + v16int( const v16int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v16int( const v16 &a ) // Init from mixed + v16int( const v16& a ) // Init from mixed { - v = a.v; + v = a.v; } - v16int( int a ) // Init from scalar + v16int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; - u.i = a; - v = _mm512_set1_ps( u.f ); + union { + int i; + float f; + } u; + u.i = a; + v = _mm512_set1_ps( u.f ); } - v16int( int i00, int i01, int i02, int i03, - int i04, int i05, int i06, int i07, - int i08, int i09, int i10, int i11, - int i12, int i13, int i14, int i15 ) // Init from scalars + v16int( int i00, int i01, int i02, int i03, int i04, int i05, int i06, + int i07, int i08, int i09, int i10, int i11, int i12, int i13, + int i14, int i15 ) // Init from scalars { - union - { - int i; - float f; - } u00, u01, u02, u03, u04, u05, u06, u07, - u08, u09, u10, u11, u12, u13, u14, u15; - - u00.i = i00; u01.i = i01; u02.i = i02; u03.i = i03; - u04.i = i04; u05.i = i05; u06.i = i06; u07.i = i07; - u08.i = i08; u09.i = i09; u10.i = i10; u11.i = i11; - u12.i = i12; u13.i = i13; u14.i = i14; u15.i = i15; - - v = _mm512_setr_ps( u00.f, u01.f, u02.f, u03.f, - u04.f, u05.f, u06.f, u07.f, - u08.f, u09.f, u10.f, u11.f, - u12.f, u13.f, u14.f, u15.f ); + union { + int i; + float f; + } u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, + u14, u15; + + u00.i = i00; + u01.i = i01; + 
u02.i = i02; + u03.i = i03; + u04.i = i04; + u05.i = i05; + u06.i = i06; + u07.i = i07; + u08.i = i08; + u09.i = i09; + u10.i = i10; + u11.i = i11; + u12.i = i12; + u13.i = i13; + u14.i = i14; + u15.i = i15; + + v = _mm512_setr_ps( u00.f, u01.f, u02.f, u03.f, u04.f, u05.f, u06.f, + u07.f, u08.f, u09.f, u10.f, u11.f, u12.f, u13.f, + u14.f, u15.f ); } - ~v16int() {} // Destructor + ~v16int() {} // Destructor // v16int assignment operators -# define ASSIGN(op) \ - inline v16int &operator op( const v16int &b ) \ - { \ - for( int j = 0; j < 16; j++ ) \ - i[j] op b.i[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16int& operator op( const v16int& b ) \ + { \ + for ( int j = 0; j < 16; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ } - inline v16int &operator =( const v16int &b ) + inline v16int& operator=( const v16int& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) - inline v16int &operator ^=( const v16int &b ) + inline v16int& operator^=( const v16int& b ) { - v = _mm512_xor_ps( v, b.v ); - return *this; + v = _mm512_xor_ps( v, b.v ); + return *this; } - inline v16int &operator &=( const v16int &b ) + inline v16int& operator&=( const v16int& b ) { - v = _mm512_and_ps( v, b.v ); - return *this; + v = _mm512_and_ps( v, b.v ); + return *this; } - inline v16int &operator |=( const v16int &b ) + inline v16int& operator|=( const v16int& b ) { - v = _mm512_or_ps( v, b.v ); - return *this; + v = _mm512_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v16int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; - } - }; + inline int operator()( int n ) { return i[n]; } +}; - // v16int prefix unary operators 
+// v16int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v16int operator op( const v16int & a ) \ - { \ - v16int b; \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_UNARY( op ) \ + inline v16int operator op( const v16int& a ) \ + { \ + v16int b; \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - inline v16int operator +( const v16int & a ) - { +inline v16int operator+( const v16int& a ) +{ v16int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v16int operator !( const v16int & a ) - { +inline v16int operator!( const v16int& a ) +{ v16int b; - for( int j = 0; j < 16; j++ ) - b.i[j] = - ( !a.i[j] ); + for ( int j = 0; j < 16; j++ ) + b.i[j] = -( !a.i[j] ); return b; - } +} - inline v16int operator ~( const v16int & a ) - { +inline v16int operator~( const v16int& a ) +{ v16int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; b.v = _mm512_xor_ps( a.v, _mm512_set1_ps( u.f ) ); return b; - } +} -# undef PREFIX_UNARY +#undef PREFIX_UNARY - // v16int prefix increment / decrement +// v16int prefix increment / decrement -# define PREFIX_INCDEC(op) \ - inline v16int operator op( v16int & a ) \ - { \ - v16int b; \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a ) \ + { \ + v16int b; \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) -# undef PREFIX_INCDEC +#undef PREFIX_INCDEC - // v16int postfix increment / decrement +// v16int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ - inline v16int operator op( v16int & a, int ) \ - { \ - v16int b; \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( a.i[j] op ); \ - return b; \ - } +#define POSTFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a, int ) \ 
+ { \ + v16int b; \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) -# undef POSTFIX_INCDEC +#undef POSTFIX_INCDEC - // v16int binary operators +// v16int binary operators -# define BINARY(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = a.i[j] op b.i[j]; \ - return c; \ - } +#define BINARY( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) - inline v16int operator ^( const v16int &a, const v16int &b ) - { +inline v16int operator^( const v16int& a, const v16int& b ) +{ v16int c; c.v = _mm512_xor_ps( a.v, b.v ); return c; - } +} - inline v16int operator &( const v16int &a, const v16int &b ) - { +inline v16int operator&( const v16int& a, const v16int& b ) +{ v16int c; c.v = _mm512_and_ps( a.v, b.v ); return c; - } +} - #if 0 +#if 0 inline v16int operator |( const v16int &a, const v16int &b ) { v16int c; @@ -2273,59 +3401,59 @@ namespace v16 return c; } - #endif +#endif - BINARY(|) - BINARY(<<) - BINARY(>>) +BINARY( | ) +BINARY( << ) +BINARY( >> ) - #undef BINARY +#undef BINARY - // v16int logical operators +// v16int logical operators -# define LOGICAL(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = - ( a.i[j] op b.i[j] ); \ - return c; \ - } +#define LOGICAL( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.i[j] op b.i[j] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) 
- LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) -# undef LOGICAL +#undef LOGICAL - // v16int miscellaneous functions +// v16int miscellaneous functions - inline v16int abs( const v16int &a ) - { +inline v16int abs( const v16int& a ) +{ v16int b; - for( int j = 0; j < 16; j++ ) - b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; return b; - } +} - inline v16 czero( const v16int &c, const v16 &a ) - { +inline v16 czero( const v16int& c, const v16& a ) +{ v16 b; - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; - } +} - #if 0 +#if 0 inline v16 czero( const v16int &c, const v16 &a ) { v16 b; @@ -2334,28 +3462,28 @@ namespace v16 return b; } - #endif +#endif - inline v16 notczero( const v16int &c, const v16 &a ) - { +inline v16 notczero( const v16int& c, const v16& a ) +{ v16 b; b.v = _mm512_and_ps( c.v, a.v ); return b; - } +} - inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) - { +inline v16 merge( const v16int& c, const v16& t, const v16& f ) +{ v16 m; - for( int j = 0; j < 16; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + for ( int j = 0; j < 16; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return m; - } +} - #if 0 +#if 0 inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) { __m512 c_v = c.v; @@ -2367,127 +3495,162 @@ namespace v16 return tf; } - #endif +#endif - //////////////// - // v16float class +//////////////// +// v16float class - class v16float : public v16 - { +class v16float : public v16 +{ // v16float prefix unary operator friends - friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; - friend inline 
v16int operator !( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator~( const v16float& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16float prefix increment / decrement operator friends - friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a ) ALWAYS_INLINE; // v16float postfix increment / decrement operator friends - friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a, int ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a, int ) ALWAYS_INLINE; // v16float binary operator friends - friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator*(const v16float& a, + const v16float& b)ALWAYS_INLINE; + friend inline v16float operator/( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int 
operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float math library friends -# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ - const v16float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline 
v16float fn( const v16float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v16float fn( const v16float& a, const v16float& b ) \ + ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v16float miscellaneous friends - friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rsqrt( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float& a ) 
ALWAYS_INLINE; + friend inline v16float rcp( const v16float& a ) ALWAYS_INLINE; + friend inline v16float fma( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline void increment_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void scale_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; public: - // v16float constructors / destructors - v16float() {} // Default constructor + v16float() {} // Default constructor - v16float( const v16float &a ) // Copy constructor + v16float( const v16float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v16float( const v16 &a ) // Init from mixed + v16float( const v16& a ) // Init from mixed { - v = a.v; + v = a.v; } - v16float( float a ) // Init from scalar + v16float( float a ) // Init from scalar { - v = _mm512_set1_ps( a ); + v = _mm512_set1_ps( a ); } - v16float( float f00, float f01, float f02, float f03, - float f04, float f05, float f06, float f07, - float f08, float f09, float f10, float f11, - float f12, float f13, float f14, float f15 ) // Init from scalars + v16float( float f00, float f01, float f02, float f03, float f04, float f05, + float f06, float f07, float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars { - v = _mm512_setr_ps( f00, f01, f02, f03, f04, f05, f06, f07, - f08, f09, f10, f11, f12, 
f13, f14, f15 ); + v = _mm512_setr_ps( f00, f01, f02, f03, f04, f05, f06, f07, f08, f09, + f10, f11, f12, f13, f14, f15 ); } - ~v16float() {} // Destructor + ~v16float() {} // Destructor // v16float assignment operators -# define ASSIGN(op,intrin) \ - inline v16float &operator op( const v16float &b ) \ - { \ - v = intrin( v, b.v ); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v16float& operator op( const v16float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v16float &operator =( const v16float &b ) + inline v16float& operator=( const v16float& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } ASSIGN( +=, _mm512_add_ps ) @@ -2495,52 +3658,46 @@ namespace v16 ASSIGN( *=, _mm512_mul_ps ) ASSIGN( /=, _mm512_div_ps ) -# undef ASSIGN +#undef ASSIGN // v16float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v16float prefix unary operators +// v16float prefix unary operators - inline v16float operator +( const v16float &a ) - { +inline v16float operator+( const v16float& a ) +{ v16float b; b.v = a.v; return b; - } +} - inline v16float operator -( const v16float &a ) - { +inline v16float operator-( const v16float& a ) +{ v16float b; b.v = _mm512_sub_ps( _mm512_setzero_ps(), a.v ); return b; - } +} - inline v16int operator !( const v16float &a ) - { +inline v16int operator!( const v16float& a ) +{ v16int b; - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[j] ? 0 : -1; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; return b; - } +} - #if 0 +#if 0 inline v16int operator !( const v16float &a ) { v16int b; @@ -2549,12 +3706,12 @@ namespace v16 return b; } - #endif +#endif - // v16float prefix increment / decrement operators +// v16float prefix increment / decrement operators - inline v16float operator ++( v16float &a ) - { +inline v16float operator++( v16float& a ) +{ v16float b; __m512 t = _mm512_add_ps( a.v, _mm512_set1_ps( 1.0f ) ); @@ -2562,10 +3719,10 @@ namespace v16 b.v = t; return b; - } +} - inline v16float operator --( v16float &a ) - { +inline v16float operator--( v16float& a ) +{ v16float b; __m512 t = _mm512_sub_ps( a.v, _mm512_set1_ps( 1.0f ) ); @@ -2573,12 +3730,12 @@ namespace v16 b.v = t; return b; - } +} - // v16float postfix increment / decrement operators +// v16float postfix increment / decrement operators - inline v16float operator ++( v16float &a, int ) - { +inline v16float operator++( v16float& a, int ) +{ v16float b; __m512 a_v = a.v; @@ -2586,10 +3743,10 @@ namespace v16 b.v = a_v; return b; - } +} - inline v16float operator --( v16float &a, int ) - { +inline v16float operator--( v16float& a, int ) +{ v16float b; __m512 a_v = a.v; @@ -2597,55 +3754,55 @@ namespace v16 b.v = a_v; return b; - } +} - // v16float binary operators +// v16float binary operators -# define BINARY(op,intrin) \ - inline v16float operator op( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v16float operator op( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - BINARY( +, _mm512_add_ps ) - BINARY( -, _mm512_sub_ps ) - BINARY( *, _mm512_mul_ps ) - BINARY( /, _mm512_div_ps ) +BINARY( +, _mm512_add_ps ) +BINARY( -, _mm512_sub_ps ) +BINARY( *, _mm512_mul_ps ) +BINARY( /, _mm512_div_ps ) -# undef BINARY +#undef BINARY - // v16float logical operators +// v16float logical operators -# define LOGICAL(op) \ - inline 
v16int operator op( const v16float &a, const v16float &b ) \ - { \ - v16int c; \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = -( a.f[j] op b.f[j] ); \ - return c; \ - } +#define LOGICAL( op ) \ + inline v16int operator op( const v16float& a, const v16float& b ) \ + { \ + v16int c; \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) -# undef LOGICAL +#undef LOGICAL #if 0 -# define LOGICAL(op,intrin,flag) \ - inline v16int operator op( const v16float &a, const v16float &b ) \ - { \ - v16int c; \ - c.v = intrin( a.v, b.v, flag ); \ - return c; \ - } +#define LOGICAL( op, intrin, flag ) \ + inline v16int operator op( const v16float& a, const v16float& b ) \ + { \ + v16int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } LOGICAL( <, _mm512_cmp_ps, _CMP_LT_OS ) LOGICAL( >, _mm512_cmp_ps, _CMP_GT_OS ) @@ -2672,103 +3829,106 @@ namespace v16 return c; } -# undef LOGICAL +#undef LOGICAL #endif - // v16float math library functions +// v16float math library functions -# define CMATH_FR1(fn) \ - inline v16float fn( const v16float &a ) \ - { \ - v16float b; \ - for( int j = 0; j < 16; j++ ) \ - b.f[j] = ::fn( a.f[j] ); \ - return b; \ - } +#define CMATH_FR1( fn ) \ + inline v16float fn( const v16float& a ) \ + { \ + v16float b; \ + for ( int j = 0; j < 16; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } -# define CMATH_FR2(fn) \ - inline v16float fn( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - for( int j = 0; j < 16; j++ ) \ - c.f[j] = ::fn( a.f[j], b.f[j] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v16float fn( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + for ( int j = 0; j < 16; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } 
- CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - inline v16float fabs( const v16float &a ) - { + inline v16float fabs( const v16float& a ) +{ v16float b; b.v = _mm512_andnot_ps( _mm512_set1_ps( -0.0f ), a.v ); return b; - } +} - inline v16float sqrt( const v16float &a ) - { +inline v16float sqrt( const v16float& a ) +{ v16float b; b.v = _mm512_sqrt_ps( a.v ); return b; - } +} - inline v16float copysign( const v16float &a, const v16float &b ) - { +inline v16float copysign( const v16float& a, const v16float& b ) +{ v16float c; __m512 t = _mm512_set1_ps( -0.0f ); c.v = _mm512_or_ps( _mm512_and_ps( t, b.v ), _mm512_andnot_ps( t, a.v ) ); return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v16float miscellaneous functions +// v16float miscellaneous functions - inline v16float rsqrt_approx( const v16float &a ) - { +inline v16float rsqrt_approx( const v16float& a ) +{ v16float b; - b.v = _mm512_rsqrt14_ps(a.v); + b.v = _mm512_rsqrt14_ps( a.v ); // b.v = _mm512_rsqrt28_ps(a.v); return b; - } +} - inline v16float rsqrt( const v16float &a ) - { +inline v16float rsqrt( const v16float& a ) +{ v16float b; __m512 a_v = a.v, b_v; // b_v = _mm512_rsqrt28_ps(a_v); - b_v = _mm512_rsqrt14_ps(a_v); + b_v = _mm512_rsqrt14_ps( a_v ); - b.v = _mm512_add_ps( b_v, _mm512_mul_ps( _mm512_set1_ps( 0.5f ), - _mm512_sub_ps( b_v, - 
_mm512_mul_ps( a_v, - _mm512_mul_ps( b_v, - _mm512_mul_ps( b_v, b_v ) ) ) ) ) ); + b.v = _mm512_add_ps( + b_v, _mm512_mul_ps( + _mm512_set1_ps( 0.5f ), + _mm512_sub_ps( + b_v, _mm512_mul_ps( + a_v, _mm512_mul_ps( + b_v, _mm512_mul_ps( b_v, b_v ) ) ) ) ) ); // Note: It is quicker to just call div_ps and sqrt_ps if more refinement // is desired. // b.v = _mm512_div_ps( _mm512_set1_ps( 1.0f ), _mm512_sqrt_ps( a.v ) ); return b; - } +} - inline v16float rcp_approx( const v16float &a ) - { +inline v16float rcp_approx( const v16float& a ) +{ v16float b; // b.v = _mm512_rcp28_ps( a.v ); @@ -2776,10 +3936,10 @@ namespace v16 b.v = _mm512_rcp14_ps( a.v ); return b; - } +} - inline v16float rcp( const v16float &a ) - { +inline v16float rcp( const v16float& a ) +{ v16float b; __m512 a_v = a.v, b_v; @@ -2788,81 +3948,81 @@ namespace v16 b_v = _mm512_rcp14_ps( a_v ); b.v = _mm512_sub_ps( _mm512_add_ps( b_v, b_v ), - _mm512_mul_ps( a_v, _mm512_mul_ps( b_v, b_v ) ) ); + _mm512_mul_ps( a_v, _mm512_mul_ps( b_v, b_v ) ) ); // b.v = _mm512_div_ps( _mm512_set1_ps( 1.0f ), a.v ); return b; - } +} - inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fma( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; d.v = _mm512_fmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; d.v = _mm512_fmsub_ps( a.v, b.v, c.v ); return d; - } +} - inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fnms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; d.v = _mm512_fnmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v16float clear_bits( const v16int &m, const v16float &a ) - { +inline v16float clear_bits( const v16int& m, const v16float& a ) +{ v16float b; b.v = _mm512_andnot_ps( m.v, a.v ); 
return b; - } +} - inline v16float set_bits( const v16int &m, const v16float &a ) - { +inline v16float set_bits( const v16int& m, const v16float& a ) +{ v16float b; b.v = _mm512_or_ps( m.v, a.v ); return b; - } +} - inline v16float toggle_bits( const v16int &m, const v16float &a ) - { +inline v16float toggle_bits( const v16int& m, const v16float& a ) +{ v16float b; b.v = _mm512_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline void increment_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ _mm512_store_ps( p, _mm512_add_ps( _mm512_load_ps( p ), a.v ) ); - } +} - inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline void decrement_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ _mm512_store_ps( p, _mm512_sub_ps( _mm512_load_ps( p ), a.v ) ); - } +} - inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline void scale_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ _mm512_store_ps( p, _mm512_mul_ps( _mm512_load_ps( p ), a.v ) ); - } +} } // namespace v16 diff --git a/src/util/v16/v16_portable.h b/src/util/v16/v16_portable.h index 084d1bb2..f69b8cda 100644 --- a/src/util/v16/v16_portable.h +++ b/src/util/v16/v16_portable.h @@ -11,384 +11,294 @@ #include #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v16 { - class v16; - class v16int; - class v16float; +class v16; +class v16int; +class v16float; - //////////////// - // v16 base class +//////////////// +// v16 base class - class v16 - { +class v16 +{ friend class v16int; friend class v16float; // v16 miscellaneous friends - friend inline int any( const v16 &a ) ALWAYS_INLINE; - friend inline int all( const v16 &a ) ALWAYS_INLINE; + friend inline int any( const v16& a ) ALWAYS_INLINE; + friend inline int all( const v16& a ) ALWAYS_INLINE; - template 
- friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 splat( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 shuffle( const v16& a ) ALWAYS_INLINE; - friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + friend inline void swap( v16& a, v16& b ) ALWAYS_INLINE; + friend inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, + v16& a04, v16& a05, v16& a06, v16& a07, + v16& a08, v16& a09, v16& a10, v16& a11, + v16& a12, v16& a13, v16& a14, + v16& a15 ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& a, + const v16& b ) ALWAYS_INLINE; // v16 memory manipulation friends - friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; - friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; - friend inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) ALWAYS_INLINE; - friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + friend inline void load_16x1( const void* ALIGNED( 64 ) p, + v16& a ) ALWAYS_INLINE; + friend inline void 
store_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void* ALIGNED( 64 ) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void* ALIGNED( 64 ) dst, + const void* ALIGNED( 64 ) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void* ALIGNED( 64 ) a, + void* ALIGNED( 64 ) b ) ALWAYS_INLINE; // v16 transposed memory manipulation friends // Note: Half aligned values are permissible in the 16x2_tr variants. - friend inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) ALWAYS_INLINE; - friend inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) 
a15, - v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; - friend inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; - friend inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) 
ALWAYS_INLINE; - friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - - friend inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; - friend inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, - void * ALIGNED(8) a01, - void * ALIGNED(8) a02, - void * ALIGNED(8) a03, - void * ALIGNED(8) a04, - void * ALIGNED(8) a05, - void * ALIGNED(8) a06, - void * ALIGNED(8) a07, - void * ALIGNED(8) a08, - void * ALIGNED(8) a09, - void * ALIGNED(8) a10, - void * ALIGNED(8) a11, - void * ALIGNED(8) a12, - void * ALIGNED(8) a13, - void * ALIGNED(8) a14, - void * ALIGNED(8) a15 ) ALWAYS_INLINE; - friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * 
ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x4_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - 
void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) ALWAYS_INLINE; - friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void + load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) ALWAYS_INLINE; + friend inline void + load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) 
a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) ALWAYS_INLINE; + friend inline void + load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) ALWAYS_INLINE; + friend inline void + load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) ALWAYS_INLINE; + friend inline void + load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, 
v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, + v16& h ) ALWAYS_INLINE; + friend inline void + load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, v16& a, + v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, v16& b00, + v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, v16& b06, v16& b07, + v16& b08, v16& b09, v16& b10, v16& b11, v16& b12, v16& b13, v16& b14, + v16& b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16& a, void* a00, void* a01, + void* a02, 
void* a03, void* a04, + void* a05, void* a06, void* a07, + void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, + void* a14, void* a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( + const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, void* ALIGNED( 8 ) a03, + void* ALIGNED( 8 ) a04, void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, void* ALIGNED( 8 ) a09, + void* ALIGNED( 8 ) a10, void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x4_tr( const v16& a, const v16& b, const v16& c, const v16& d, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + 
void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr_p( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) 
a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; protected: - - union - { - int i[16]; - float f[16]; + union { + int i[16]; + float f[16]; }; public: + v16() {} // Default constructor - v16() {} // Default constructor - - v16( const v16 &a ) // Copy constructor + v16( const v16& a ) // Copy constructor { - i[ 0]=a.i[ 0]; i[ 1]=a.i[ 1]; i[ 2]=a.i[ 2]; i[ 3]=a.i[ 3]; - i[ 4]=a.i[ 4]; i[ 5]=a.i[ 5]; i[ 6]=a.i[ 6]; i[ 7]=a.i[ 7]; - i[ 8]=a.i[ 8]; i[ 9]=a.i[ 9]; i[10]=a.i[10]; i[11]=a.i[11]; - i[12]=a.i[12]; i[13]=a.i[13]; i[14]=a.i[14]; i[15]=a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - ~v16() {} // Default destructor - }; - - // v16 miscellaneous functions - - inline int any( const v16 &a ) - { - return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || - a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || - a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || - a.i[12] || a.i[13] || a.i[14] || a.i[15]; - } - - inline int all( const v16 &a ) - { - return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && - a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && - a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && - a.i[12] && a.i[13] && a.i[14] && a.i[15]; - } - - template - inline v16 splat( const v16 & a ) - { + ~v16() {} // Default destructor +}; + +// v16 miscellaneous functions + +inline int any( const v16& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7] || a.i[8] || a.i[9] || a.i[10] || a.i[11] || a.i[12] || + a.i[13] || a.i[14] || a.i[15]; +} + +inline int all( const v16& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7] && a.i[8] && a.i[9] && a.i[10] && a.i[11] && a.i[12] && + a.i[13] && a.i[14] 
&& a.i[15]; +} + +template +inline v16 splat( const v16& a ) +{ v16 b; - b.i[ 0] = a.i[n]; - b.i[ 1] = a.i[n]; - b.i[ 2] = a.i[n]; - b.i[ 3] = a.i[n]; - b.i[ 4] = a.i[n]; - b.i[ 5] = a.i[n]; - b.i[ 6] = a.i[n]; - b.i[ 7] = a.i[n]; - b.i[ 8] = a.i[n]; - b.i[ 9] = a.i[n]; + b.i[0] = a.i[n]; + b.i[1] = a.i[n]; + b.i[2] = a.i[n]; + b.i[3] = a.i[n]; + b.i[4] = a.i[n]; + b.i[5] = a.i[n]; + b.i[6] = a.i[n]; + b.i[7] = a.i[n]; + b.i[8] = a.i[n]; + b.i[9] = a.i[n]; b.i[10] = a.i[n]; b.i[11] = a.i[n]; b.i[12] = a.i[n]; @@ -397,23 +307,25 @@ namespace v16 b.i[15] = a.i[n]; return b; - } +} - template - inline v16 shuffle( const v16 & a ) - { +template +inline v16 shuffle( const v16& a ) +{ v16 b; - b.i[ 0] = a.i[i00]; - b.i[ 1] = a.i[i01]; - b.i[ 2] = a.i[i02]; - b.i[ 3] = a.i[i03]; - b.i[ 4] = a.i[i04]; - b.i[ 5] = a.i[i05]; - b.i[ 6] = a.i[i06]; - b.i[ 7] = a.i[i07]; - b.i[ 8] = a.i[i08]; - b.i[ 9] = a.i[i09]; + b.i[0] = a.i[i00]; + b.i[1] = a.i[i01]; + b.i[2] = a.i[i02]; + b.i[3] = a.i[i03]; + b.i[4] = a.i[i04]; + b.i[5] = a.i[i05]; + b.i[6] = a.i[i06]; + b.i[7] = a.i[i07]; + b.i[8] = a.i[i08]; + b.i[9] = a.i[i09]; b.i[10] = a.i[i10]; b.i[11] = a.i[i11]; b.i[12] = a.i[i12]; @@ -422,2939 +334,3046 @@ namespace v16 b.i[15] = a.i[i15]; return b; - } - -# define sw(x,y) x^=y, y^=x, x^=y - - inline void swap( v16 &a, v16 &b ) - { - sw( a.i[ 0], b.i[ 0] ); - sw( a.i[ 1], b.i[ 1] ); - sw( a.i[ 2], b.i[ 2] ); - sw( a.i[ 3], b.i[ 3] ); - sw( a.i[ 4], b.i[ 4] ); - sw( a.i[ 5], b.i[ 5] ); - sw( a.i[ 6], b.i[ 6] ); - sw( a.i[ 7], b.i[ 7] ); - sw( a.i[ 8], b.i[ 8] ); - sw( a.i[ 9], b.i[ 9] ); +} + +#define sw( x, y ) x ^= y, y ^= x, x ^= y + +inline void swap( v16& a, v16& b ) +{ + sw( a.i[0], b.i[0] ); + sw( a.i[1], b.i[1] ); + sw( a.i[2], b.i[2] ); + sw( a.i[3], b.i[3] ); + sw( a.i[4], b.i[4] ); + sw( a.i[5], b.i[5] ); + sw( a.i[6], b.i[6] ); + sw( a.i[7], b.i[7] ); + sw( a.i[8], b.i[8] ); + sw( a.i[9], b.i[9] ); sw( a.i[10], b.i[10] ); sw( a.i[11], b.i[11] ); sw( a.i[12], b.i[12] ); 
sw( a.i[13], b.i[13] ); sw( a.i[14], b.i[14] ); sw( a.i[15], b.i[15] ); - } - - inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) - { - sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0] ); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); - sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); - sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); - sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); - sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); - sw( a05.i[6],a06.i[5] ); sw( 
a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); - sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); - sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); - sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); - sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); - sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); - sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); - sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); - sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); - sw( a14.i[15],a15.i[14] ); - } - -# undef sw - - // v16 memory manipulation functions - - inline void load_16x1( const void * ALIGNED(64) p, - v16 &a ) - { - a.i[ 0] = ((const int * ALIGNED(64))p)[ 0]; - a.i[ 1] = ((const int * ALIGNED(64))p)[ 1]; - a.i[ 2] = ((const int * ALIGNED(64))p)[ 2]; - a.i[ 3] = ((const int * ALIGNED(64))p)[ 3]; - a.i[ 4] = ((const int * ALIGNED(64))p)[ 4]; - a.i[ 5] = ((const int * ALIGNED(64))p)[ 5]; - a.i[ 6] = ((const int * ALIGNED(64))p)[ 6]; - a.i[ 7] = ((const int * ALIGNED(64))p)[ 7]; - a.i[ 8] = ((const int * ALIGNED(64))p)[ 8]; - a.i[ 
9] = ((const int * ALIGNED(64))p)[ 9]; - a.i[10] = ((const int * ALIGNED(64))p)[10]; - a.i[11] = ((const int * ALIGNED(64))p)[11]; - a.i[12] = ((const int * ALIGNED(64))p)[12]; - a.i[13] = ((const int * ALIGNED(64))p)[13]; - a.i[14] = ((const int * ALIGNED(64))p)[14]; - a.i[15] = ((const int * ALIGNED(64))p)[15]; - } - - inline void store_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; - ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; - ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; - ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; - ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; - ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; - ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; - ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; - ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; - ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; - ((int * ALIGNED(64))p)[10] = a.i[10]; - ((int * ALIGNED(64))p)[11] = a.i[11]; - ((int * ALIGNED(64))p)[12] = a.i[12]; - ((int * ALIGNED(64))p)[13] = a.i[13]; - ((int * ALIGNED(64))p)[14] = a.i[14]; - ((int * ALIGNED(64))p)[15] = a.i[15]; - } - - inline void stream_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; - ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; - ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; - ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; - ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; - ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; - ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; - ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; - ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; - ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; - ((int * ALIGNED(64))p)[10] = a.i[10]; - ((int * ALIGNED(64))p)[11] = a.i[11]; - ((int * ALIGNED(64))p)[12] = a.i[12]; - ((int * ALIGNED(64))p)[13] = a.i[13]; - ((int * ALIGNED(64))p)[14] = a.i[14]; - ((int * ALIGNED(64))p)[15] = a.i[15]; - } - - inline void clear_16x1( void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = 0; - ((int * ALIGNED(64))p)[ 1] = 0; - ((int * ALIGNED(64))p)[ 2] = 0; - ((int * ALIGNED(64))p)[ 3] = 0; - ((int * ALIGNED(64))p)[ 4] = 0; - ((int * 
ALIGNED(64))p)[ 5] = 0; - ((int * ALIGNED(64))p)[ 6] = 0; - ((int * ALIGNED(64))p)[ 7] = 0; - ((int * ALIGNED(64))p)[ 8] = 0; - ((int * ALIGNED(64))p)[ 9] = 0; - ((int * ALIGNED(64))p)[10] = 0; - ((int * ALIGNED(64))p)[11] = 0; - ((int * ALIGNED(64))p)[12] = 0; - ((int * ALIGNED(64))p)[13] = 0; - ((int * ALIGNED(64))p)[14] = 0; - ((int * ALIGNED(64))p)[15] = 0; - } - - // FIXME: Ordering semantics - inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) - { - ((int * ALIGNED(64))dst)[ 0] = ((const int * ALIGNED(64))src)[ 0]; - ((int * ALIGNED(64))dst)[ 1] = ((const int * ALIGNED(64))src)[ 1]; - ((int * ALIGNED(64))dst)[ 2] = ((const int * ALIGNED(64))src)[ 2]; - ((int * ALIGNED(64))dst)[ 3] = ((const int * ALIGNED(64))src)[ 3]; - ((int * ALIGNED(64))dst)[ 4] = ((const int * ALIGNED(64))src)[ 4]; - ((int * ALIGNED(64))dst)[ 5] = ((const int * ALIGNED(64))src)[ 5]; - ((int * ALIGNED(64))dst)[ 6] = ((const int * ALIGNED(64))src)[ 6]; - ((int * ALIGNED(64))dst)[ 7] = ((const int * ALIGNED(64))src)[ 7]; - ((int * ALIGNED(64))dst)[ 8] = ((const int * ALIGNED(64))src)[ 8]; - ((int * ALIGNED(64))dst)[ 9] = ((const int * ALIGNED(64))src)[ 9]; - ((int * ALIGNED(64))dst)[10] = ((const int * ALIGNED(64))src)[10]; - ((int * ALIGNED(64))dst)[11] = ((const int * ALIGNED(64))src)[11]; - ((int * ALIGNED(64))dst)[12] = ((const int * ALIGNED(64))src)[12]; - ((int * ALIGNED(64))dst)[13] = ((const int * ALIGNED(64))src)[13]; - ((int * ALIGNED(64))dst)[14] = ((const int * ALIGNED(64))src)[14]; - ((int * ALIGNED(64))dst)[15] = ((const int * ALIGNED(64))src)[15]; - } - - inline void swap_16x1( void * ALIGNED(64) a, - void * ALIGNED(64) b ) - { +} + +inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, v16& a04, + v16& a05, v16& a06, v16& a07, v16& a08, v16& a09, + v16& a10, v16& a11, v16& a12, v16& a13, v16& a14, + v16& a15 ) +{ + sw( a00.i[1], a01.i[0] ); + sw( a00.i[2], a02.i[0] ); + sw( a00.i[3], a03.i[0] ); + sw( a00.i[4], a04.i[0] ); + sw( a00.i[5], 
a05.i[0] ); + sw( a00.i[6], a06.i[0] ); + sw( a00.i[7], a07.i[0] ); + sw( a00.i[8], a08.i[0] ); + sw( a00.i[9], a09.i[0] ); + sw( a00.i[10], a10.i[0] ); + sw( a00.i[11], a11.i[0] ); + sw( a00.i[12], a12.i[0] ); + sw( a00.i[13], a13.i[0] ); + sw( a00.i[14], a14.i[0] ); + sw( a00.i[15], a15.i[0] ); + sw( a01.i[2], a02.i[1] ); + sw( a01.i[3], a03.i[1] ); + sw( a01.i[4], a04.i[1] ); + sw( a01.i[5], a05.i[1] ); + sw( a01.i[6], a06.i[1] ); + sw( a01.i[7], a07.i[1] ); + sw( a01.i[8], a08.i[1] ); + sw( a01.i[9], a09.i[1] ); + sw( a01.i[10], a10.i[1] ); + sw( a01.i[11], a11.i[1] ); + sw( a01.i[12], a12.i[1] ); + sw( a01.i[13], a13.i[1] ); + sw( a01.i[14], a14.i[1] ); + sw( a01.i[15], a15.i[1] ); + sw( a02.i[3], a03.i[2] ); + sw( a02.i[4], a04.i[2] ); + sw( a02.i[5], a05.i[2] ); + sw( a02.i[6], a06.i[2] ); + sw( a02.i[7], a07.i[2] ); + sw( a02.i[8], a08.i[2] ); + sw( a02.i[9], a09.i[2] ); + sw( a02.i[10], a10.i[2] ); + sw( a02.i[11], a11.i[2] ); + sw( a02.i[12], a12.i[2] ); + sw( a02.i[13], a13.i[2] ); + sw( a02.i[14], a14.i[2] ); + sw( a02.i[15], a15.i[2] ); + sw( a03.i[4], a04.i[3] ); + sw( a03.i[5], a05.i[3] ); + sw( a03.i[6], a06.i[3] ); + sw( a03.i[7], a07.i[3] ); + sw( a03.i[8], a08.i[3] ); + sw( a03.i[9], a09.i[3] ); + sw( a03.i[10], a10.i[3] ); + sw( a03.i[11], a11.i[3] ); + sw( a03.i[12], a12.i[3] ); + sw( a03.i[13], a13.i[3] ); + sw( a03.i[14], a14.i[3] ); + sw( a03.i[15], a15.i[3] ); + sw( a04.i[5], a05.i[4] ); + sw( a04.i[6], a06.i[4] ); + sw( a04.i[7], a07.i[4] ); + sw( a04.i[8], a08.i[4] ); + sw( a04.i[9], a09.i[4] ); + sw( a04.i[10], a10.i[4] ); + sw( a04.i[11], a11.i[4] ); + sw( a04.i[12], a12.i[4] ); + sw( a04.i[13], a13.i[4] ); + sw( a04.i[14], a14.i[4] ); + sw( a04.i[15], a15.i[4] ); + sw( a05.i[6], a06.i[5] ); + sw( a05.i[7], a07.i[5] ); + sw( a05.i[8], a08.i[5] ); + sw( a05.i[9], a09.i[5] ); + sw( a05.i[10], a10.i[5] ); + sw( a05.i[11], a11.i[5] ); + sw( a05.i[12], a12.i[5] ); + sw( a05.i[13], a13.i[5] ); + sw( a05.i[14], a14.i[5] ); + sw( a05.i[15], 
a15.i[5] ); + sw( a06.i[7], a07.i[6] ); + sw( a06.i[8], a08.i[6] ); + sw( a06.i[9], a09.i[6] ); + sw( a06.i[10], a10.i[6] ); + sw( a06.i[11], a11.i[6] ); + sw( a06.i[12], a12.i[6] ); + sw( a06.i[13], a13.i[6] ); + sw( a06.i[14], a14.i[6] ); + sw( a06.i[15], a15.i[6] ); + sw( a07.i[8], a08.i[7] ); + sw( a07.i[9], a09.i[7] ); + sw( a07.i[10], a10.i[7] ); + sw( a07.i[11], a11.i[7] ); + sw( a07.i[12], a12.i[7] ); + sw( a07.i[13], a13.i[7] ); + sw( a07.i[14], a14.i[7] ); + sw( a07.i[15], a15.i[7] ); + sw( a08.i[9], a09.i[8] ); + sw( a08.i[10], a10.i[8] ); + sw( a08.i[11], a11.i[8] ); + sw( a08.i[12], a12.i[8] ); + sw( a08.i[13], a13.i[8] ); + sw( a08.i[14], a14.i[8] ); + sw( a08.i[15], a15.i[8] ); + sw( a09.i[10], a10.i[9] ); + sw( a09.i[11], a11.i[9] ); + sw( a09.i[12], a12.i[9] ); + sw( a09.i[13], a13.i[9] ); + sw( a09.i[14], a14.i[9] ); + sw( a09.i[15], a15.i[9] ); + sw( a10.i[11], a11.i[10] ); + sw( a10.i[12], a12.i[10] ); + sw( a10.i[13], a13.i[10] ); + sw( a10.i[14], a14.i[10] ); + sw( a10.i[15], a15.i[10] ); + sw( a11.i[12], a12.i[11] ); + sw( a11.i[13], a13.i[11] ); + sw( a11.i[14], a14.i[11] ); + sw( a11.i[15], a15.i[11] ); + sw( a12.i[13], a13.i[12] ); + sw( a12.i[14], a14.i[12] ); + sw( a12.i[15], a15.i[12] ); + sw( a13.i[14], a14.i[13] ); + sw( a13.i[15], a15.i[13] ); + sw( a14.i[15], a15.i[14] ); +} + +#undef sw + +// v16 memory manipulation functions + +inline void load_16x1( const void* ALIGNED( 64 ) p, v16& a ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 64 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 64 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 64 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 64 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 64 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 64 ))p )[6]; + a.i[7] = ( (const int* ALIGNED( 64 ))p )[7]; + a.i[8] = ( (const int* ALIGNED( 64 ))p )[8]; + a.i[9] = ( (const int* ALIGNED( 64 ))p )[9]; + a.i[10] = ( (const int* ALIGNED( 64 ))p )[10]; + a.i[11] = ( (const int* 
ALIGNED( 64 ))p )[11]; + a.i[12] = ( (const int* ALIGNED( 64 ))p )[12]; + a.i[13] = ( (const int* ALIGNED( 64 ))p )[13]; + a.i[14] = ( (const int* ALIGNED( 64 ))p )[14]; + a.i[15] = ( (const int* ALIGNED( 64 ))p )[15]; +} + +inline void store_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 64 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 64 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 64 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 64 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 64 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 64 ))p )[7] = a.i[7]; + ( (int* ALIGNED( 64 ))p )[8] = a.i[8]; + ( (int* ALIGNED( 64 ))p )[9] = a.i[9]; + ( (int* ALIGNED( 64 ))p )[10] = a.i[10]; + ( (int* ALIGNED( 64 ))p )[11] = a.i[11]; + ( (int* ALIGNED( 64 ))p )[12] = a.i[12]; + ( (int* ALIGNED( 64 ))p )[13] = a.i[13]; + ( (int* ALIGNED( 64 ))p )[14] = a.i[14]; + ( (int* ALIGNED( 64 ))p )[15] = a.i[15]; +} + +inline void stream_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 64 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 64 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 64 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 64 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 64 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 64 ))p )[7] = a.i[7]; + ( (int* ALIGNED( 64 ))p )[8] = a.i[8]; + ( (int* ALIGNED( 64 ))p )[9] = a.i[9]; + ( (int* ALIGNED( 64 ))p )[10] = a.i[10]; + ( (int* ALIGNED( 64 ))p )[11] = a.i[11]; + ( (int* ALIGNED( 64 ))p )[12] = a.i[12]; + ( (int* ALIGNED( 64 ))p )[13] = a.i[13]; + ( (int* ALIGNED( 64 ))p )[14] = a.i[14]; + ( (int* ALIGNED( 64 ))p )[15] = a.i[15]; +} + +inline void clear_16x1( void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = 0; + ( (int* ALIGNED( 64 ))p )[1] = 0; + ( (int* ALIGNED( 64 ))p )[2] = 0; + ( (int* ALIGNED( 64 ))p )[3] = 0; + ( (int* ALIGNED( 64 ))p )[4] = 0; + ( (int* ALIGNED( 64 ))p )[5] = 0; + ( (int* ALIGNED( 64 ))p 
)[6] = 0; + ( (int* ALIGNED( 64 ))p )[7] = 0; + ( (int* ALIGNED( 64 ))p )[8] = 0; + ( (int* ALIGNED( 64 ))p )[9] = 0; + ( (int* ALIGNED( 64 ))p )[10] = 0; + ( (int* ALIGNED( 64 ))p )[11] = 0; + ( (int* ALIGNED( 64 ))p )[12] = 0; + ( (int* ALIGNED( 64 ))p )[13] = 0; + ( (int* ALIGNED( 64 ))p )[14] = 0; + ( (int* ALIGNED( 64 ))p )[15] = 0; +} + +// FIXME: Ordering semantics +inline void copy_16x1( void* ALIGNED( 64 ) dst, const void* ALIGNED( 64 ) src ) +{ + ( (int* ALIGNED( 64 ))dst )[0] = ( (const int* ALIGNED( 64 ))src )[0]; + ( (int* ALIGNED( 64 ))dst )[1] = ( (const int* ALIGNED( 64 ))src )[1]; + ( (int* ALIGNED( 64 ))dst )[2] = ( (const int* ALIGNED( 64 ))src )[2]; + ( (int* ALIGNED( 64 ))dst )[3] = ( (const int* ALIGNED( 64 ))src )[3]; + ( (int* ALIGNED( 64 ))dst )[4] = ( (const int* ALIGNED( 64 ))src )[4]; + ( (int* ALIGNED( 64 ))dst )[5] = ( (const int* ALIGNED( 64 ))src )[5]; + ( (int* ALIGNED( 64 ))dst )[6] = ( (const int* ALIGNED( 64 ))src )[6]; + ( (int* ALIGNED( 64 ))dst )[7] = ( (const int* ALIGNED( 64 ))src )[7]; + ( (int* ALIGNED( 64 ))dst )[8] = ( (const int* ALIGNED( 64 ))src )[8]; + ( (int* ALIGNED( 64 ))dst )[9] = ( (const int* ALIGNED( 64 ))src )[9]; + ( (int* ALIGNED( 64 ))dst )[10] = ( (const int* ALIGNED( 64 ))src )[10]; + ( (int* ALIGNED( 64 ))dst )[11] = ( (const int* ALIGNED( 64 ))src )[11]; + ( (int* ALIGNED( 64 ))dst )[12] = ( (const int* ALIGNED( 64 ))src )[12]; + ( (int* ALIGNED( 64 ))dst )[13] = ( (const int* ALIGNED( 64 ))src )[13]; + ( (int* ALIGNED( 64 ))dst )[14] = ( (const int* ALIGNED( 64 ))src )[14]; + ( (int* ALIGNED( 64 ))dst )[15] = ( (const int* ALIGNED( 64 ))src )[15]; +} + +inline void swap_16x1( void* ALIGNED( 64 ) a, void* ALIGNED( 64 ) b ) +{ int t; - t = ((int * ALIGNED(64))a)[ 0]; - ((int * ALIGNED(64))a)[ 0] = ((int * ALIGNED(64))b)[ 0]; - ((int * ALIGNED(64))b)[ 0] = t; - - t = ((int * ALIGNED(64))a)[ 1]; - ((int * ALIGNED(64))a)[ 1] = ((int * ALIGNED(64))b)[ 1]; - ((int * ALIGNED(64))b)[ 1] = t; - - t = ((int * 
ALIGNED(64))a)[ 2]; - ((int * ALIGNED(64))a)[ 2] = ((int * ALIGNED(64))b)[ 2]; - ((int * ALIGNED(64))b)[ 2] = t; - - t = ((int * ALIGNED(64))a)[ 3]; - ((int * ALIGNED(64))a)[ 3] = ((int * ALIGNED(64))b)[ 3]; - ((int * ALIGNED(64))b)[ 3] = t; - - t = ((int * ALIGNED(64))a)[ 4]; - ((int * ALIGNED(64))a)[ 4] = ((int * ALIGNED(64))b)[ 4]; - ((int * ALIGNED(64))b)[ 4] = t; - - t = ((int * ALIGNED(64))a)[ 5]; - ((int * ALIGNED(64))a)[ 5] = ((int * ALIGNED(64))b)[ 5]; - ((int * ALIGNED(64))b)[ 5] = t; - - t = ((int * ALIGNED(64))a)[ 6]; - ((int * ALIGNED(64))a)[ 6] = ((int * ALIGNED(64))b)[ 6]; - ((int * ALIGNED(64))b)[ 6] = t; - - t = ((int * ALIGNED(64))a)[ 7]; - ((int * ALIGNED(64))a)[ 7] = ((int * ALIGNED(64))b)[ 7]; - ((int * ALIGNED(64))b)[ 7] = t; - - t = ((int * ALIGNED(64))a)[ 8]; - ((int * ALIGNED(64))a)[ 8] = ((int * ALIGNED(64))b)[ 8]; - ((int * ALIGNED(64))b)[ 8] = t; - - t = ((int * ALIGNED(64))a)[ 9]; - ((int * ALIGNED(64))a)[ 9] = ((int * ALIGNED(64))b)[ 9]; - ((int * ALIGNED(64))b)[ 9] = t; - - t = ((int * ALIGNED(64))a)[10]; - ((int * ALIGNED(64))a)[10] = ((int * ALIGNED(64))b)[10]; - ((int * ALIGNED(64))b)[10] = t; - - t = ((int * ALIGNED(64))a)[11]; - ((int * ALIGNED(64))a)[11] = ((int * ALIGNED(64))b)[11]; - ((int * ALIGNED(64))b)[11] = t; - - t = ((int * ALIGNED(64))a)[12]; - ((int * ALIGNED(64))a)[12] = ((int * ALIGNED(64))b)[12]; - ((int * ALIGNED(64))b)[12] = t; - - t = ((int * ALIGNED(64))a)[13]; - ((int * ALIGNED(64))a)[13] = ((int * ALIGNED(64))b)[13]; - ((int * ALIGNED(64))b)[13] = t; - - t = ((int * ALIGNED(64))a)[14]; - ((int * ALIGNED(64))a)[14] = ((int * ALIGNED(64))b)[14]; - ((int * ALIGNED(64))b)[14] = t; - - t = ((int * ALIGNED(64))a)[15]; - ((int * ALIGNED(64))a)[15] = ((int * ALIGNED(64))b)[15]; - ((int * ALIGNED(64))b)[15] = t; - } - - // v16 transposed memory manipulation functions - - inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void 
*a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) - { - a.i[ 0] = ((const int *)a00)[0]; - a.i[ 1] = ((const int *)a01)[0]; - a.i[ 2] = ((const int *)a02)[0]; - a.i[ 3] = ((const int *)a03)[0]; - a.i[ 4] = ((const int *)a04)[0]; - a.i[ 5] = ((const int *)a05)[0]; - a.i[ 6] = ((const int *)a06)[0]; - a.i[ 7] = ((const int *)a07)[0]; - a.i[ 8] = ((const int *)a08)[0]; - a.i[ 9] = ((const int *)a09)[0]; - a.i[10] = ((const int *)a10)[0]; - a.i[11] = ((const int *)a11)[0]; - a.i[12] = ((const int *)a12)[0]; - a.i[13] = ((const int *)a13)[0]; - a.i[14] = ((const int *)a14)[0]; - a.i[15] = ((const int *)a15)[0]; - } - - inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) - { - a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; - - a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; - - a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; - - a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; - - a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; - - a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; - - a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; - - a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; - 
b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; - - a.i[ 8] = ((const int * ALIGNED(8))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; - - a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; - - a.i[10] = ((const int * ALIGNED(8))a10)[0]; - b.i[10] = ((const int * ALIGNED(8))a10)[1]; - - a.i[11] = ((const int * ALIGNED(8))a11)[0]; - b.i[11] = ((const int * ALIGNED(8))a11)[1]; - - a.i[12] = ((const int * ALIGNED(8))a12)[0]; - b.i[12] = ((const int * ALIGNED(8))a12)[1]; - - a.i[13] = ((const int * ALIGNED(8))a13)[0]; - b.i[13] = ((const int * ALIGNED(8))a13)[1]; - - a.i[14] = ((const int * ALIGNED(8))a14)[0]; - b.i[14] = ((const int * ALIGNED(8))a14)[1]; - - a.i[15] = ((const int * ALIGNED(8))a15)[0]; - b.i[15] = ((const int * ALIGNED(8))a15)[1]; - } - - inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] 
= ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - } - - inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * 
ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - - a.i[ 9] = ((const int * 
ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - } - - inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * 
ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; - e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; - f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; - g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; - h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; - f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; - g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; - h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; - f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; - g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; - h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; - f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; - g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; - h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; - f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; - g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; - h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * 
ALIGNED(64))a05)[3]; - e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; - f.i[ 5] = ((const int * ALIGNED(64))a05)[5]; - g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; - h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; - f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; - g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; - h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; - f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; - g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; - h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; - f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; - g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; - h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; - e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; - f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; - g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; - h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - e.i[10] = ((const int * ALIGNED(64))a10)[4]; - f.i[10] = ((const int * 
ALIGNED(64))a10)[5]; - g.i[10] = ((const int * ALIGNED(64))a10)[6]; - h.i[10] = ((const int * ALIGNED(64))a10)[7]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - e.i[11] = ((const int * ALIGNED(64))a11)[4]; - f.i[11] = ((const int * ALIGNED(64))a11)[5]; - g.i[11] = ((const int * ALIGNED(64))a11)[6]; - h.i[11] = ((const int * ALIGNED(64))a11)[7]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - e.i[12] = ((const int * ALIGNED(64))a12)[4]; - f.i[12] = ((const int * ALIGNED(64))a12)[5]; - g.i[12] = ((const int * ALIGNED(64))a12)[6]; - h.i[12] = ((const int * ALIGNED(64))a12)[7]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - e.i[13] = ((const int * ALIGNED(64))a13)[4]; - f.i[13] = ((const int * ALIGNED(64))a13)[5]; - g.i[13] = ((const int * ALIGNED(64))a13)[6]; - h.i[13] = ((const int * ALIGNED(64))a13)[7]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - e.i[14] = ((const int * ALIGNED(64))a14)[4]; - f.i[14] = ((const int * ALIGNED(64))a14)[5]; - g.i[14] = ((const int * ALIGNED(64))a14)[6]; - h.i[14] = ((const int * ALIGNED(64))a14)[7]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - e.i[15] = ((const int * ALIGNED(64))a15)[4]; - f.i[15] = ((const int * ALIGNED(64))a15)[5]; - g.i[15] = ((const int * ALIGNED(64))a15)[6]; - h.i[15] = ((const int * 
ALIGNED(64))a15)[7]; - } - - inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; - b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; - b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; - b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; - b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; - b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; - b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; - b15.i[ 0] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; - b09.i[ 
1] = ((const int * ALIGNED(64))a01)[ 9]; - b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; - b11.i[ 1] = ((const int * ALIGNED(64))a01)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a01)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; - b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; - b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; - b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; - b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; - b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; - b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; - b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; - b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 3] = ((const int * ALIGNED(64))a03)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; - 
b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; - b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; - b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; - b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; - b11.i[ 4] = ((const int * ALIGNED(64))a04)[11]; - b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; - b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; - b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; - b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; - b15.i[ 5] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; - b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 
8]; - b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; - b10.i[ 6] = ((const int * ALIGNED(64))a06)[10]; - b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; - b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; - b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; - b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; - b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 8] = ((const int * ALIGNED(64))a08)[10]; - b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; - - b00.i[ 9] = ((const int * 
ALIGNED(64))a09)[ 0]; - b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; - b02.i[ 9] = ((const int * ALIGNED(64))a09)[ 2]; - b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; - b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; - b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; - b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; - b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[10] = ((const int * ALIGNED(64))a10)[10]; - b11.i[10] = ((const int * ALIGNED(64))a10)[11]; - b12.i[10] = ((const int * ALIGNED(64))a10)[12]; - b13.i[10] = ((const int * ALIGNED(64))a10)[13]; - b14.i[10] = ((const int * ALIGNED(64))a10)[14]; - b15.i[10] = ((const int * ALIGNED(64))a10)[15]; - - b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; - b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; - b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; - b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; - b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; - b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; - b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; - b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[11] = ((const 
int * ALIGNED(64))a11)[ 8]; - b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[11] = ((const int * ALIGNED(64))a11)[10]; - b11.i[11] = ((const int * ALIGNED(64))a11)[11]; - b12.i[11] = ((const int * ALIGNED(64))a11)[12]; - b13.i[11] = ((const int * ALIGNED(64))a11)[13]; - b14.i[11] = ((const int * ALIGNED(64))a11)[14]; - b15.i[11] = ((const int * ALIGNED(64))a11)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[12] = ((const int * ALIGNED(64))a12)[10]; - b11.i[12] = ((const int * ALIGNED(64))a12)[11]; - b12.i[12] = ((const int * ALIGNED(64))a12)[12]; - b13.i[12] = ((const int * ALIGNED(64))a12)[13]; - b14.i[12] = ((const int * ALIGNED(64))a12)[14]; - b15.i[12] = ((const int * ALIGNED(64))a12)[15]; - - b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; - b01.i[13] = ((const int * ALIGNED(64))a13)[ 1]; - b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; - b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; - b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; - b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; - b06.i[13] = ((const int * ALIGNED(64))a13)[ 6]; - b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a13)[10]; - b11.i[13] = ((const int * ALIGNED(64))a13)[11]; - b12.i[13] = ((const int * ALIGNED(64))a13)[12]; - b13.i[13] = ((const int * ALIGNED(64))a13)[13]; - b14.i[13] = ((const int * ALIGNED(64))a13)[14]; - b15.i[13] = ((const int * ALIGNED(64))a13)[15]; - - b00.i[14] = 
((const int * ALIGNED(64))a14)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[14] = ((const int * ALIGNED(64))a14)[10]; - b11.i[14] = ((const int * ALIGNED(64))a14)[11]; - b12.i[14] = ((const int * ALIGNED(64))a14)[12]; - b13.i[14] = ((const int * ALIGNED(64))a14)[13]; - b14.i[14] = ((const int * ALIGNED(64))a14)[14]; - b15.i[14] = ((const int * ALIGNED(64))a14)[15]; - - b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; - b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; - b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; - b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; - b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; - b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; - b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; - b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = ((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) - { - b00.i[ 0] = ((const int * 
ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const 
int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = 
((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = ((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - 
b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - } - - inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * 
ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const 
int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = 
((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = ((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - - b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; - b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; - b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; - b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; - b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; - b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; - b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; - b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 
1] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 1] = ((const int * ALIGNED(64))a08)[10]; - b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; - - b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; - b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; - b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; - b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; - b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; - b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; - b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; - b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; - - b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; - b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; - b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; - b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; - b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; - b13.i[ 4] = ((const int * ALIGNED(64))a10)[ 5]; - b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 6]; - b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; - b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; - - b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; - 
b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; - b10.i[ 6] = ((const int * ALIGNED(64))a11)[ 2]; - b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; - b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; - b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; - b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; - b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a11)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; - - b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; - b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; - b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; - b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; - b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; - b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; - b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; - b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; - b14.i[ 9] = ((const int * ALIGNED(64))a12)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; - - b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; - b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; - b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; - b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; - b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; - b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; - b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; - b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[11] = ((const int * ALIGNED(64))a13)[ 
8]; - b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[11] = ((const int * ALIGNED(64))a13)[10]; - b11.i[11] = ((const int * ALIGNED(64))a13)[11]; - b12.i[11] = ((const int * ALIGNED(64))a13)[12]; - b13.i[11] = ((const int * ALIGNED(64))a13)[13]; - b14.i[11] = ((const int * ALIGNED(64))a13)[14]; - b15.i[11] = ((const int * ALIGNED(64))a13)[15]; - - b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; - b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; - b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; - b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; - b12.i[12] = ((const int * ALIGNED(64))a14)[ 4]; - b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; - b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; - b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a14)[10]; - b11.i[13] = ((const int * ALIGNED(64))a14)[11]; - b12.i[13] = ((const int * ALIGNED(64))a14)[12]; - b13.i[13] = ((const int * ALIGNED(64))a14)[13]; - b14.i[13] = ((const int * ALIGNED(64))a14)[14]; - b15.i[13] = ((const int * ALIGNED(64))a14)[15]; - - b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; - b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; - b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; - b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; - b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; - b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; - b14.i[14] = ((const int * ALIGNED(64))a15)[ 6]; - b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = ((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void store_16x1_tr( const 
v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) - { - ((int *)a00)[0] = a.i[ 0]; - ((int *)a01)[0] = a.i[ 1]; - ((int *)a02)[0] = a.i[ 2]; - ((int *)a03)[0] = a.i[ 3]; - ((int *)a04)[0] = a.i[ 4]; - ((int *)a05)[0] = a.i[ 5]; - ((int *)a06)[0] = a.i[ 6]; - ((int *)a07)[0] = a.i[ 7]; - ((int *)a08)[0] = a.i[ 8]; - ((int *)a09)[0] = a.i[ 9]; - ((int *)a10)[0] = a.i[10]; - ((int *)a11)[0] = a.i[11]; - ((int *)a12)[0] = a.i[12]; - ((int *)a13)[0] = a.i[13]; - ((int *)a14)[0] = a.i[14]; - ((int *)a15)[0] = a.i[15]; - } - - inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, void * ALIGNED(8) a01, - void * ALIGNED(8) a02, void * ALIGNED(8) a03, - void * ALIGNED(8) a04, void * ALIGNED(8) a05, - void * ALIGNED(8) a06, void * ALIGNED(8) a07, - void * ALIGNED(8) a08, void * ALIGNED(8) a09, - void * ALIGNED(8) a10, void * ALIGNED(8) a11, - void * ALIGNED(8) a12, void * ALIGNED(8) a13, - void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) - { - ((int * ALIGNED(8))a00)[0] = a.i[ 0]; - ((int * ALIGNED(8))a00)[1] = b.i[ 0]; - - ((int * ALIGNED(8))a01)[0] = a.i[ 1]; - ((int * ALIGNED(8))a01)[1] = b.i[ 1]; - - ((int * ALIGNED(8))a02)[0] = a.i[ 2]; - ((int * ALIGNED(8))a02)[1] = b.i[ 2]; - - ((int * ALIGNED(8))a03)[0] = a.i[ 3]; - ((int * ALIGNED(8))a03)[1] = b.i[ 3]; - - ((int * ALIGNED(8))a04)[0] = a.i[ 4]; - ((int * ALIGNED(8))a04)[1] = b.i[ 4]; - - ((int * ALIGNED(8))a05)[0] = a.i[ 5]; - ((int * ALIGNED(8))a05)[1] = b.i[ 5]; - - ((int * ALIGNED(8))a06)[0] = a.i[ 6]; - ((int * ALIGNED(8))a06)[1] = b.i[ 6]; - - ((int * ALIGNED(8))a07)[0] = a.i[ 7]; - ((int * ALIGNED(8))a07)[1] = b.i[ 7]; - - ((int * ALIGNED(8))a08)[0] = a.i[ 8]; - ((int * ALIGNED(8))a08)[1] = b.i[ 8]; - - ((int * ALIGNED(8))a09)[0] = a.i[ 9]; - ((int * ALIGNED(8))a09)[1] = b.i[ 9]; - - ((int * ALIGNED(8))a10)[0] = a.i[10]; - ((int * 
ALIGNED(8))a10)[1] = b.i[10]; - - ((int * ALIGNED(8))a11)[0] = a.i[11]; - ((int * ALIGNED(8))a11)[1] = b.i[11]; - - ((int * ALIGNED(8))a12)[0] = a.i[12]; - ((int * ALIGNED(8))a12)[1] = b.i[12]; - - ((int * ALIGNED(8))a13)[0] = a.i[13]; - ((int * ALIGNED(8))a13)[1] = b.i[13]; - - ((int * ALIGNED(8))a14)[0] = a.i[14]; - ((int * ALIGNED(8))a14)[1] = b.i[14]; - - ((int * ALIGNED(8))a15)[0] = a.i[15]; - ((int * ALIGNED(8))a15)[1] = b.i[15]; - } - - inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 
8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - } - - inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - 
((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * 
ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - } - - inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - const v16 &e, const v16 &f, const v16 &g, const v16 &h, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - ((int * ALIGNED(64))a00)[4] = e.i[ 0]; - ((int * ALIGNED(64))a00)[5] = f.i[ 0]; - ((int * ALIGNED(64))a00)[6] = g.i[ 0]; - ((int * ALIGNED(64))a00)[7] = h.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - ((int * ALIGNED(64))a01)[4] = e.i[ 1]; - ((int * ALIGNED(64))a01)[5] = f.i[ 1]; - ((int * ALIGNED(64))a01)[6] = g.i[ 1]; - ((int * ALIGNED(64))a01)[7] = h.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - ((int * ALIGNED(64))a02)[4] = e.i[ 2]; - ((int * ALIGNED(64))a02)[5] = f.i[ 2]; - ((int * ALIGNED(64))a02)[6] = g.i[ 2]; - ((int * ALIGNED(64))a02)[7] = h.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - ((int * ALIGNED(64))a03)[4] = e.i[ 3]; - ((int * ALIGNED(64))a03)[5] = f.i[ 3]; - ((int * ALIGNED(64))a03)[6] = g.i[ 3]; - ((int * ALIGNED(64))a03)[7] = h.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * 
ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - ((int * ALIGNED(64))a04)[4] = e.i[ 4]; - ((int * ALIGNED(64))a04)[5] = f.i[ 4]; - ((int * ALIGNED(64))a04)[6] = g.i[ 4]; - ((int * ALIGNED(64))a04)[7] = h.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - ((int * ALIGNED(64))a05)[4] = e.i[ 5]; - ((int * ALIGNED(64))a05)[5] = f.i[ 5]; - ((int * ALIGNED(64))a05)[6] = g.i[ 5]; - ((int * ALIGNED(64))a05)[7] = h.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - ((int * ALIGNED(64))a06)[4] = e.i[ 6]; - ((int * ALIGNED(64))a06)[5] = f.i[ 6]; - ((int * ALIGNED(64))a06)[6] = g.i[ 6]; - ((int * ALIGNED(64))a06)[7] = h.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - ((int * ALIGNED(64))a07)[4] = e.i[ 7]; - ((int * ALIGNED(64))a07)[5] = f.i[ 7]; - ((int * ALIGNED(64))a07)[6] = g.i[ 7]; - ((int * ALIGNED(64))a07)[7] = h.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - ((int * ALIGNED(64))a08)[4] = e.i[ 8]; - ((int * ALIGNED(64))a08)[5] = f.i[ 8]; - ((int * ALIGNED(64))a08)[6] = g.i[ 8]; - ((int * ALIGNED(64))a08)[7] = h.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - ((int * ALIGNED(64))a09)[4] = e.i[ 9]; - ((int * ALIGNED(64))a09)[5] = f.i[ 9]; - ((int * ALIGNED(64))a09)[6] = g.i[ 9]; - ((int * ALIGNED(64))a09)[7] = h.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] 
= b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - ((int * ALIGNED(64))a10)[4] = e.i[10]; - ((int * ALIGNED(64))a10)[5] = f.i[10]; - ((int * ALIGNED(64))a10)[6] = g.i[10]; - ((int * ALIGNED(64))a10)[7] = h.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - ((int * ALIGNED(64))a11)[4] = e.i[11]; - ((int * ALIGNED(64))a11)[5] = f.i[11]; - ((int * ALIGNED(64))a11)[6] = g.i[11]; - ((int * ALIGNED(64))a11)[7] = h.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - ((int * ALIGNED(64))a12)[4] = e.i[12]; - ((int * ALIGNED(64))a12)[5] = f.i[12]; - ((int * ALIGNED(64))a12)[6] = g.i[12]; - ((int * ALIGNED(64))a12)[7] = h.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - ((int * ALIGNED(64))a13)[4] = e.i[13]; - ((int * ALIGNED(64))a13)[5] = f.i[13]; - ((int * ALIGNED(64))a13)[6] = g.i[13]; - ((int * ALIGNED(64))a13)[7] = h.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - ((int * ALIGNED(64))a14)[4] = e.i[14]; - ((int * ALIGNED(64))a14)[5] = f.i[14]; - ((int * ALIGNED(64))a14)[6] = g.i[14]; - ((int * ALIGNED(64))a14)[7] = h.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - ((int * ALIGNED(64))a15)[4] = e.i[15]; - ((int * ALIGNED(64))a15)[5] = f.i[15]; - ((int * ALIGNED(64))a15)[6] = g.i[15]; - ((int * ALIGNED(64))a15)[7] = h.i[15]; - } - - inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const 
v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; - ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; - ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; - ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; - ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; - ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; - ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; - ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 1]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; - ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; - ((int * 
ALIGNED(64))a01)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; - ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; - ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; - ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; - ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; - ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; - ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; - ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; - ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; - ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; - ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; - ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; - ((int * 
ALIGNED(64))a04)[12] = b12.i[ 4]; - ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; - ((int * ALIGNED(64))a04)[14] = b14.i[ 4]; - ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; - ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; - ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; - ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; - ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; - ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; - ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; - ((int * ALIGNED(64))a06)[14] = b14.i[ 6]; - ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; - ((int * ALIGNED(64))a07)[ 8] = b08.i[ 7]; - ((int * 
ALIGNED(64))a07)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; - ((int * ALIGNED(64))a07)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a08)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; - - ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; - ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; - ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; - ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; - ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; - ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; - ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; - ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; - ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; - ((int * ALIGNED(64))a09)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; - - ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; - ((int * 
ALIGNED(64))a10)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a10)[ 8] = b08.i[10]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; - ((int * ALIGNED(64))a10)[10] = b10.i[10]; - ((int * ALIGNED(64))a10)[11] = b11.i[10]; - ((int * ALIGNED(64))a10)[12] = b12.i[10]; - ((int * ALIGNED(64))a10)[13] = b13.i[10]; - ((int * ALIGNED(64))a10)[14] = b14.i[10]; - ((int * ALIGNED(64))a10)[15] = b15.i[10]; - - ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; - ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; - ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; - ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; - ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; - ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; - ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; - ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a11)[10] = b10.i[11]; - ((int * ALIGNED(64))a11)[11] = b11.i[11]; - ((int * ALIGNED(64))a11)[12] = b12.i[11]; - ((int * ALIGNED(64))a11)[13] = b13.i[11]; - ((int * ALIGNED(64))a11)[14] = b14.i[11]; - ((int * ALIGNED(64))a11)[15] = b15.i[11]; - - ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a12)[ 8] = b08.i[12]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; - ((int * ALIGNED(64))a12)[10] = b10.i[12]; - ((int * ALIGNED(64))a12)[11] = b11.i[12]; - ((int * ALIGNED(64))a12)[12] = b12.i[12]; - ((int * ALIGNED(64))a12)[13] = b13.i[12]; - ((int * ALIGNED(64))a12)[14] = b14.i[12]; - ((int * ALIGNED(64))a12)[15] = b15.i[12]; - - ((int * ALIGNED(64))a13)[ 0] = b00.i[13]; - ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; - ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; - ((int * 
ALIGNED(64))a13)[ 3] = b03.i[13]; - ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; - ((int * ALIGNED(64))a13)[ 5] = b05.i[13]; - ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; - ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a13)[10] = b10.i[13]; - ((int * ALIGNED(64))a13)[11] = b11.i[13]; - ((int * ALIGNED(64))a13)[12] = b12.i[13]; - ((int * ALIGNED(64))a13)[13] = b13.i[13]; - ((int * ALIGNED(64))a13)[14] = b14.i[13]; - ((int * ALIGNED(64))a13)[15] = b15.i[13]; - - ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; - ((int * ALIGNED(64))a14)[10] = b10.i[14]; - ((int * ALIGNED(64))a14)[11] = b11.i[14]; - ((int * ALIGNED(64))a14)[12] = b12.i[14]; - ((int * ALIGNED(64))a14)[13] = b13.i[14]; - ((int * ALIGNED(64))a14)[14] = b14.i[14]; - ((int * ALIGNED(64))a14)[15] = b15.i[14]; - - ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; - ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; - ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; - ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; - ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; - ((int * ALIGNED(64))a15)[ 5] = b05.i[15]; - ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; - ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - inline void 
store_16x8_tr_p( const v16 &b00, - const v16 &b01, - const v16 &b02, - const v16 &b03, - const v16 &b04, - const v16 &b05, - const v16 &b06, - const v16 &b07, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 
4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * 
ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * 
ALIGNED(64))a07)[15] = b07.i[15]; - } - - inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * 
ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * 
ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * 
ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * ALIGNED(64))a07)[15] = b07.i[15]; - - ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; - ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; - ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; - ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; - ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; - ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; - ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; - ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; - ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; - ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; - ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; - ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; - ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; - ((int * ALIGNED(64))a09)[ 6] = b14.i[ 2]; - ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; - ((int * ALIGNED(64))a09)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; - ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; - ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; - ((int * 
ALIGNED(64))a10)[ 3] = b11.i[ 4]; - ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; - ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; - ((int * ALIGNED(64))a10)[ 6] = b14.i[ 4]; - ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; - ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; - ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; - ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; - ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; - ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; - ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; - ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; - ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; - ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; - ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; - ((int * ALIGNED(64))a12)[ 2] = b10.i[ 8]; - ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; - ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; - ((int * ALIGNED(64))a12)[ 5] = b13.i[ 8]; - ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; - ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; - ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a12)[10] = b10.i[ 9]; - ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; - - ((int * 
ALIGNED(64))a13)[ 0] = b08.i[10]; - ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; - ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; - ((int * ALIGNED(64))a13)[ 3] = b11.i[10]; - ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; - ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; - ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; - ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a13)[10] = b10.i[11]; - ((int * ALIGNED(64))a13)[11] = b11.i[11]; - ((int * ALIGNED(64))a13)[12] = b12.i[11]; - ((int * ALIGNED(64))a13)[13] = b13.i[11]; - ((int * ALIGNED(64))a13)[14] = b14.i[11]; - ((int * ALIGNED(64))a13)[15] = b15.i[11]; - - ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; - ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; - ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; - ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; - ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; - ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; - ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; - ((int * ALIGNED(64))a14)[ 7] = b15.i[12]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a14)[10] = b10.i[13]; - ((int * ALIGNED(64))a14)[11] = b11.i[13]; - ((int * ALIGNED(64))a14)[12] = b12.i[13]; - ((int * ALIGNED(64))a14)[13] = b13.i[13]; - ((int * ALIGNED(64))a14)[14] = b14.i[13]; - ((int * ALIGNED(64))a14)[15] = b15.i[13]; - - ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; - ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; - ((int * ALIGNED(64))a15)[ 2] = b10.i[14]; - ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; - ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; - ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; - ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; - ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * 
ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - ////////////// - // v16int class - - class v16int : public v16 - { + t = ( (int* ALIGNED( 64 ))a )[0]; + ( (int* ALIGNED( 64 ))a )[0] = ( (int* ALIGNED( 64 ))b )[0]; + ( (int* ALIGNED( 64 ))b )[0] = t; + + t = ( (int* ALIGNED( 64 ))a )[1]; + ( (int* ALIGNED( 64 ))a )[1] = ( (int* ALIGNED( 64 ))b )[1]; + ( (int* ALIGNED( 64 ))b )[1] = t; + + t = ( (int* ALIGNED( 64 ))a )[2]; + ( (int* ALIGNED( 64 ))a )[2] = ( (int* ALIGNED( 64 ))b )[2]; + ( (int* ALIGNED( 64 ))b )[2] = t; + + t = ( (int* ALIGNED( 64 ))a )[3]; + ( (int* ALIGNED( 64 ))a )[3] = ( (int* ALIGNED( 64 ))b )[3]; + ( (int* ALIGNED( 64 ))b )[3] = t; + + t = ( (int* ALIGNED( 64 ))a )[4]; + ( (int* ALIGNED( 64 ))a )[4] = ( (int* ALIGNED( 64 ))b )[4]; + ( (int* ALIGNED( 64 ))b )[4] = t; + + t = ( (int* ALIGNED( 64 ))a )[5]; + ( (int* ALIGNED( 64 ))a )[5] = ( (int* ALIGNED( 64 ))b )[5]; + ( (int* ALIGNED( 64 ))b )[5] = t; + + t = ( (int* ALIGNED( 64 ))a )[6]; + ( (int* ALIGNED( 64 ))a )[6] = ( (int* ALIGNED( 64 ))b )[6]; + ( (int* ALIGNED( 64 ))b )[6] = t; + + t = ( (int* ALIGNED( 64 ))a )[7]; + ( (int* ALIGNED( 64 ))a )[7] = ( (int* ALIGNED( 64 ))b )[7]; + ( (int* ALIGNED( 64 ))b )[7] = t; + + t = ( (int* ALIGNED( 64 ))a )[8]; + ( (int* ALIGNED( 64 ))a )[8] = ( (int* ALIGNED( 64 ))b )[8]; + ( (int* ALIGNED( 64 ))b )[8] = t; + + t = ( (int* ALIGNED( 64 ))a )[9]; + ( (int* ALIGNED( 64 ))a )[9] = ( (int* ALIGNED( 64 ))b )[9]; + ( (int* ALIGNED( 64 ))b )[9] = t; + + t = ( (int* ALIGNED( 64 ))a )[10]; + ( (int* ALIGNED( 64 ))a )[10] = ( (int* ALIGNED( 64 ))b )[10]; + ( (int* ALIGNED( 64 ))b )[10] = t; + + t = ( (int* ALIGNED( 64 ))a )[11]; + ( (int* ALIGNED( 64 ))a )[11] = ( (int* ALIGNED( 64 ))b )[11]; + ( (int* ALIGNED( 64 ))b )[11] = t; + + t = ( (int* ALIGNED( 64 ))a )[12]; + ( (int* ALIGNED( 64 ))a )[12] = ( (int* ALIGNED( 64 ))b )[12]; + ( (int* ALIGNED( 64 ))b )[12] = t; + + t = ( 
(int* ALIGNED( 64 ))a )[13]; + ( (int* ALIGNED( 64 ))a )[13] = ( (int* ALIGNED( 64 ))b )[13]; + ( (int* ALIGNED( 64 ))b )[13] = t; + + t = ( (int* ALIGNED( 64 ))a )[14]; + ( (int* ALIGNED( 64 ))a )[14] = ( (int* ALIGNED( 64 ))b )[14]; + ( (int* ALIGNED( 64 ))b )[14] = t; + + t = ( (int* ALIGNED( 64 ))a )[15]; + ( (int* ALIGNED( 64 ))a )[15] = ( (int* ALIGNED( 64 ))b )[15]; + ( (int* ALIGNED( 64 ))b )[15] = t; +} + +// v16 transposed memory manipulation functions + +inline void load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) +{ + a.i[0] = ( (const int*)a00 )[0]; + a.i[1] = ( (const int*)a01 )[0]; + a.i[2] = ( (const int*)a02 )[0]; + a.i[3] = ( (const int*)a03 )[0]; + a.i[4] = ( (const int*)a04 )[0]; + a.i[5] = ( (const int*)a05 )[0]; + a.i[6] = ( (const int*)a06 )[0]; + a.i[7] = ( (const int*)a07 )[0]; + a.i[8] = ( (const int*)a08 )[0]; + a.i[9] = ( (const int*)a09 )[0]; + a.i[10] = ( (const int*)a10 )[0]; + a.i[11] = ( (const int*)a11 )[0]; + a.i[12] = ( (const int*)a12 )[0]; + a.i[13] = ( (const int*)a13 )[0]; + a.i[14] = ( (const int*)a14 )[0]; + a.i[15] = ( (const int*)a15 )[0]; +} + +inline void +load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a00 
)[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a01 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a02 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a03 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a04 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a05 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a06 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a07 )[1]; + + a.i[8] = ( (const int* ALIGNED( 8 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 8 ))a08 )[1]; + + a.i[9] = ( (const int* ALIGNED( 8 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 8 ))a09 )[1]; + + a.i[10] = ( (const int* ALIGNED( 8 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 8 ))a10 )[1]; + + a.i[11] = ( (const int* ALIGNED( 8 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 8 ))a11 )[1]; + + a.i[12] = ( (const int* ALIGNED( 8 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 8 ))a12 )[1]; + + a.i[13] = ( (const int* ALIGNED( 8 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 8 ))a13 )[1]; + + a.i[14] = ( (const int* ALIGNED( 8 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 8 ))a14 )[1]; + + a.i[15] = ( (const int* ALIGNED( 8 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 8 ))a15 )[1]; +} + +inline void +load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, 
const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 
)[2]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; +} + +inline void +load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( 
(const int* ALIGNED( 64 ))a04 )[3]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( 
(const int* ALIGNED( 64 ))a14 )[3]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; +} + +inline void +load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + e.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + f.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + g.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + h.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + e.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + f.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + g.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + h.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + e.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + f.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + g.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + h.i[2] = ( 
(const int* ALIGNED( 64 ))a02 )[7]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + e.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + f.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + g.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + h.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + e.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + f.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + g.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + h.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + e.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + f.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + g.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + h.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + e.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + f.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + g.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + h.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + e.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + f.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + g.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + h.i[7] = ( (const int* ALIGNED( 64 ))a07 
)[7]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + e.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + f.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + g.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + h.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + e.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + f.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + g.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + h.i[9] = ( (const int* ALIGNED( 64 ))a09 )[7]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + e.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + f.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + g.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + h.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + e.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + f.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + g.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + h.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + e.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + f.i[12] = ( (const int* ALIGNED( 64 ))a12 )[5]; + g.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + h.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; 
+ + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + e.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + f.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + g.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + h.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + e.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + f.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + g.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + h.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + e.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + f.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + g.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + h.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; +} + +inline void +load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* 
ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b08.i[0] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b10.i[0] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b15.i[0] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b10.i[1] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b08.i[2] = ( (const 
int* ALIGNED( 64 ))a02 )[8]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b11.i[2] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b08.i[4] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b15.i[4] 
= ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b14.i[5] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b08.i[6] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b09.i[6] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + 
b06.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + b08.i[8] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a08 )[10]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b00.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a09 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a09 
)[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[9] = ( (const int* ALIGNED( 64 ))a09 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[10] = ( (const int* ALIGNED( 64 ))a10 )[8]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[10] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a10 )[14]; + b15.i[10] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b00.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[11] = ( (const int* ALIGNED( 64 ))a11 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a12 
)[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a12 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[12] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a12 )[15]; + + b00.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[14] = ( (const int* ALIGNED( 64 ))a14 )[8]; + 
b09.i[14] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[14] = ( (const int* ALIGNED( 64 ))a14 )[11]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[14] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b00.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( (const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 ))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void +load_16x8_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + 
b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 
))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const 
int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* 
ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; +} + +inline void +load_16x16_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( 
(const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + 
b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const 
int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b08.i[0] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a08 )[1]; + b10.i[0] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b15.i[0] = ( (const int* ALIGNED( 
64 ))a08 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[1] = ( (const int* ALIGNED( 64 ))a08 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b08.i[2] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b11.i[2] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a09 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a09 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a09 )[15]; + + b08.i[4] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b15.i[4] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a10 )[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[5] = ( (const int* 
ALIGNED( 64 ))a10 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b08.i[6] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b09.i[6] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a11 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b08.i[8] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a12 )[2]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a12 )[5]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[9] = ( (const int* ALIGNED( 64 ))a12 )[15]; + + b08.i[10] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b13.i[10] 
= ( (const int* ALIGNED( 64 ))a13 )[5]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b15.i[10] = ( (const int* ALIGNED( 64 ))a13 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[11] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b08.i[12] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a14 )[3]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a14 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a14 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b08.i[14] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b09.i[14] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b11.i[14] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b13.i[14] = ( (const int* ALIGNED( 64 ))a15 )[5]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( 
(const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 ))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void store_16x1_tr( const v16& a, void* a00, void* a01, void* a02, + void* a03, void* a04, void* a05, void* a06, + void* a07, void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, void* a14, + void* a15 ) +{ + ( (int*)a00 )[0] = a.i[0]; + ( (int*)a01 )[0] = a.i[1]; + ( (int*)a02 )[0] = a.i[2]; + ( (int*)a03 )[0] = a.i[3]; + ( (int*)a04 )[0] = a.i[4]; + ( (int*)a05 )[0] = a.i[5]; + ( (int*)a06 )[0] = a.i[6]; + ( (int*)a07 )[0] = a.i[7]; + ( (int*)a08 )[0] = a.i[8]; + ( (int*)a09 )[0] = a.i[9]; + ( (int*)a10 )[0] = a.i[10]; + ( (int*)a11 )[0] = a.i[11]; + ( (int*)a12 )[0] = a.i[12]; + ( (int*)a13 )[0] = a.i[13]; + ( (int*)a14 )[0] = a.i[14]; + ( (int*)a15 )[0] = a.i[15]; +} + +inline void store_16x2_tr( const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, + void* ALIGNED( 8 ) a03, void* ALIGNED( 8 ) a04, + void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, + void* ALIGNED( 8 ) a09, void* ALIGNED( 8 ) a10, + void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) +{ + ( (int* ALIGNED( 8 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a00 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a01 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a02 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a03 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a04 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a05 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 8 
))a06 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a07 )[1] = b.i[7]; + + ( (int* ALIGNED( 8 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 8 ))a08 )[1] = b.i[8]; + + ( (int* ALIGNED( 8 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 8 ))a09 )[1] = b.i[9]; + + ( (int* ALIGNED( 8 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 8 ))a10 )[1] = b.i[10]; + + ( (int* ALIGNED( 8 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 8 ))a11 )[1] = b.i[11]; + + ( (int* ALIGNED( 8 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 8 ))a12 )[1] = b.i[12]; + + ( (int* ALIGNED( 8 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 8 ))a13 )[1] = b.i[13]; + + ( (int* ALIGNED( 8 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 8 ))a14 )[1] = b.i[14]; + + ( (int* ALIGNED( 8 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 8 ))a15 )[1] = b.i[15]; +} + +inline void store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; 
+ ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; +} + +inline void store_16x4_tr( const v16& a, const v16& b, const v16& c, + const v16& d, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, + void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, + void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 
)[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 
64 ))a11 )[3] = d.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; +} + +inline void store_16x8_tr( + const v16& a, const v16& b, const v16& c, const v16& d, const v16& e, + const v16& f, const v16& g, const v16& h, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = e.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = f.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = g.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = h.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = e.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = f.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = g.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = h.i[1]; + + ( 
(int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = e.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = f.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = g.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = h.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = e.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = f.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = g.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = h.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = e.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = f.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = g.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = h.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = e.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = f.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = g.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = h.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = e.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = f.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = g.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = h.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = e.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = f.i[7]; + ( (int* 
ALIGNED( 64 ))a07 )[6] = g.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = h.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = e.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = f.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = g.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = h.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = e.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = f.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = g.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = h.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = e.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = f.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = g.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = h.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = e.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = f.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = g.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = h.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = e.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = f.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = g.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = h.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = 
d.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = e.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = f.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = g.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = h.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = e.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = f.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = g.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = h.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = e.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = f.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = g.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = h.i[15]; +} + +inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] 
= b08.i[0]; + ( (int* ALIGNED( 64 ))a00 )[9] = b09.i[0]; + ( (int* ALIGNED( 64 ))a00 )[10] = b10.i[0]; + ( (int* ALIGNED( 64 ))a00 )[11] = b11.i[0]; + ( (int* ALIGNED( 64 ))a00 )[12] = b12.i[0]; + ( (int* ALIGNED( 64 ))a00 )[13] = b13.i[0]; + ( (int* ALIGNED( 64 ))a00 )[14] = b14.i[0]; + ( (int* ALIGNED( 64 ))a00 )[15] = b15.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[1]; + ( (int* ALIGNED( 64 ))a01 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a01 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a01 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a01 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a01 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a01 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a01 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a01 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a02 )[8] = b08.i[2]; + ( (int* ALIGNED( 64 ))a02 )[9] = b09.i[2]; + ( (int* ALIGNED( 64 ))a02 )[10] = b10.i[2]; + ( (int* ALIGNED( 64 ))a02 )[11] = b11.i[2]; + ( (int* ALIGNED( 64 ))a02 )[12] = b12.i[2]; + ( (int* ALIGNED( 64 ))a02 )[13] = b13.i[2]; + ( (int* ALIGNED( 64 ))a02 )[14] = b14.i[2]; + ( (int* ALIGNED( 64 ))a02 )[15] = b15.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[3]; + ( (int* ALIGNED( 64 ))a03 
)[4] = b04.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[3]; + ( (int* ALIGNED( 64 ))a03 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a03 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a03 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a03 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a03 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a03 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a03 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a03 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a04 )[8] = b08.i[4]; + ( (int* ALIGNED( 64 ))a04 )[9] = b09.i[4]; + ( (int* ALIGNED( 64 ))a04 )[10] = b10.i[4]; + ( (int* ALIGNED( 64 ))a04 )[11] = b11.i[4]; + ( (int* ALIGNED( 64 ))a04 )[12] = b12.i[4]; + ( (int* ALIGNED( 64 ))a04 )[13] = b13.i[4]; + ( (int* ALIGNED( 64 ))a04 )[14] = b14.i[4]; + ( (int* ALIGNED( 64 ))a04 )[15] = b15.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[5]; + ( (int* ALIGNED( 64 ))a05 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a05 )[9] = b09.i[5]; + ( (int* ALIGNED( 64 ))a05 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a05 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a05 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a05 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a05 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a05 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 
))a06 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a06 )[8] = b08.i[6]; + ( (int* ALIGNED( 64 ))a06 )[9] = b09.i[6]; + ( (int* ALIGNED( 64 ))a06 )[10] = b10.i[6]; + ( (int* ALIGNED( 64 ))a06 )[11] = b11.i[6]; + ( (int* ALIGNED( 64 ))a06 )[12] = b12.i[6]; + ( (int* ALIGNED( 64 ))a06 )[13] = b13.i[6]; + ( (int* ALIGNED( 64 ))a06 )[14] = b14.i[6]; + ( (int* ALIGNED( 64 ))a06 )[15] = b15.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[7]; + ( (int* ALIGNED( 64 ))a07 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a07 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a07 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a07 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a07 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a07 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a07 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a07 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[8]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[8]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[8]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[8]; + ( (int* ALIGNED( 64 
))a08 )[12] = b12.i[8]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[8]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[8]; + ( (int* ALIGNED( 64 ))a08 )[15] = b15.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b00.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b01.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = b02.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = b03.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = b04.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = b05.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = b06.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = b07.i[9]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[9]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[10]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[10]; + ( (int* ALIGNED( 64 ))a10 )[10] = b10.i[10]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[10]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[10]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[10]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[10]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b00.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b01.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = b02.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = b03.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = b04.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = b05.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = b06.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = 
b07.i[11]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 ))a11 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[12]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[12]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[12]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[12]; + ( (int* ALIGNED( 64 ))a12 )[12] = b12.i[12]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[12]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[12]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b00.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b01.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = b02.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = b03.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = b04.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = b05.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = b06.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = b07.i[13]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a13 )[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] 
= b02.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[14]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[14]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[14]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[14]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[14]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[14]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[14]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b00.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b01.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = b02.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = b03.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = b04.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = b05.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = b06.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = b07.i[15]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +inline void store_16x8_tr_p( const v16& b00, const v16& b01, const v16& b02, + const v16& b03, const v16& b04, const v16& b05, + const v16& b06, const v16& b07, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = 
b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 
)[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* 
ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( (int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; +} + +inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + 
void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 
))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* 
ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* 
ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( (int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b08.i[0]; + ( (int* ALIGNED( 64 ))a08 )[1] = b09.i[0]; + ( (int* ALIGNED( 64 ))a08 )[2] = b10.i[0]; + ( (int* ALIGNED( 64 ))a08 )[3] = b11.i[0]; + ( (int* ALIGNED( 64 ))a08 )[4] = b12.i[0]; + ( (int* ALIGNED( 64 ))a08 )[5] = b13.i[0]; + ( (int* ALIGNED( 64 ))a08 )[6] = b14.i[0]; + ( (int* ALIGNED( 64 ))a08 )[7] = b15.i[0]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a08 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a08 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b08.i[2]; + ( (int* ALIGNED( 64 ))a09 )[1] = b09.i[2]; + ( (int* ALIGNED( 64 ))a09 )[2] = b10.i[2]; + ( (int* ALIGNED( 64 ))a09 )[3] = b11.i[2]; + ( (int* ALIGNED( 64 ))a09 )[4] = b12.i[2]; + ( (int* ALIGNED( 64 ))a09 )[5] = b13.i[2]; + ( (int* ALIGNED( 64 ))a09 )[6] = b14.i[2]; + ( (int* ALIGNED( 64 ))a09 )[7] = b15.i[2]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a10 )[0] = b08.i[4]; + ( (int* ALIGNED( 64 ))a10 )[1] = b09.i[4]; + ( (int* ALIGNED( 64 ))a10 )[2] = b10.i[4]; + ( (int* ALIGNED( 64 ))a10 )[3] = b11.i[4]; + ( (int* ALIGNED( 64 ))a10 )[4] = b12.i[4]; + ( (int* ALIGNED( 64 ))a10 )[5] = b13.i[4]; + ( (int* ALIGNED( 64 ))a10 )[6] = b14.i[4]; + 
( (int* ALIGNED( 64 ))a10 )[7] = b15.i[4]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[5]; + ( (int* ALIGNED( 64 ))a10 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b08.i[6]; + ( (int* ALIGNED( 64 ))a11 )[1] = b09.i[6]; + ( (int* ALIGNED( 64 ))a11 )[2] = b10.i[6]; + ( (int* ALIGNED( 64 ))a11 )[3] = b11.i[6]; + ( (int* ALIGNED( 64 ))a11 )[4] = b12.i[6]; + ( (int* ALIGNED( 64 ))a11 )[5] = b13.i[6]; + ( (int* ALIGNED( 64 ))a11 )[6] = b14.i[6]; + ( (int* ALIGNED( 64 ))a11 )[7] = b15.i[6]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a11 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b08.i[8]; + ( (int* ALIGNED( 64 ))a12 )[1] = b09.i[8]; + ( (int* ALIGNED( 64 ))a12 )[2] = b10.i[8]; + ( (int* ALIGNED( 64 ))a12 )[3] = b11.i[8]; + ( (int* ALIGNED( 64 ))a12 )[4] = b12.i[8]; + ( (int* ALIGNED( 64 ))a12 )[5] = b13.i[8]; + ( (int* ALIGNED( 64 ))a12 )[6] = b14.i[8]; + ( (int* ALIGNED( 64 ))a12 )[7] = b15.i[8]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a12 )[12] = b12.i[9]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b08.i[10]; + ( (int* ALIGNED( 64 ))a13 )[1] = b09.i[10]; + ( (int* ALIGNED( 64 ))a13 )[2] = 
b10.i[10]; + ( (int* ALIGNED( 64 ))a13 )[3] = b11.i[10]; + ( (int* ALIGNED( 64 ))a13 )[4] = b12.i[10]; + ( (int* ALIGNED( 64 ))a13 )[5] = b13.i[10]; + ( (int* ALIGNED( 64 ))a13 )[6] = b14.i[10]; + ( (int* ALIGNED( 64 ))a13 )[7] = b15.i[10]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a13 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b08.i[12]; + ( (int* ALIGNED( 64 ))a14 )[1] = b09.i[12]; + ( (int* ALIGNED( 64 ))a14 )[2] = b10.i[12]; + ( (int* ALIGNED( 64 ))a14 )[3] = b11.i[12]; + ( (int* ALIGNED( 64 ))a14 )[4] = b12.i[12]; + ( (int* ALIGNED( 64 ))a14 )[5] = b13.i[12]; + ( (int* ALIGNED( 64 ))a14 )[6] = b14.i[12]; + ( (int* ALIGNED( 64 ))a14 )[7] = b15.i[12]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b08.i[14]; + ( (int* ALIGNED( 64 ))a15 )[1] = b09.i[14]; + ( (int* ALIGNED( 64 ))a15 )[2] = b10.i[14]; + ( (int* ALIGNED( 64 ))a15 )[3] = b11.i[14]; + ( (int* ALIGNED( 64 ))a15 )[4] = b12.i[14]; + ( (int* ALIGNED( 64 ))a15 )[5] = b13.i[14]; + ( (int* ALIGNED( 64 ))a15 )[6] = b14.i[14]; + ( (int* ALIGNED( 64 ))a15 )[7] = b15.i[14]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = 
b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +////////////// +// v16int class + +class v16int : public v16 +{ // v16int prefix unary operator friends - friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator~( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16int prefix increment / decrement operator friends - friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a ) ALWAYS_INLINE; // v16int postfix increment / decrement operator friends - friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a, int ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a, int ) ALWAYS_INLINE; // v16int binary operator friends - friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ^( const v16int &a, const v16int &b ) 
ALWAYS_INLINE; - friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator*(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator/( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator%( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator^( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator|( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int logical operator friends - friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16int& a, + const 
v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16int abs( const v16int& a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& t, + const v16& f ) ALWAYS_INLINE; // v16float unary operator friends - friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float miscellaneous friends - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const 
v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; public: - // v16int constructors / destructors - v16int() {} // Default constructor + v16int() {} // Default constructor - v16int( const v16int &a ) // Copy constructor + v16int( const v16int& a ) // Copy constructor { - i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; - i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; - i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; - i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - v16int( const v16 &a ) // Init from mixed + v16int( const v16& a ) // Init from mixed { - i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; - i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; - i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; - i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - v16int( int a ) // Init from scalar + v16int( int a ) // Init from scalar { - i[ 0] = a; i[ 1] = a; i[ 2] = a; i[ 3] = a; - i[ 4] = a; i[ 5] = a; i[ 
6] = a; i[ 7] = a; - i[ 8] = a; i[ 9] = a; i[10] = a; i[11] = a; - i[12] = a; i[13] = a; i[14] = a; i[15] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; + i[4] = a; + i[5] = a; + i[6] = a; + i[7] = a; + i[8] = a; + i[9] = a; + i[10] = a; + i[11] = a; + i[12] = a; + i[13] = a; + i[14] = a; + i[15] = a; } - v16int( int i00, int i01, int i02, int i03, - int i04, int i05, int i06, int i07, - int i08, int i09, int i10, int i11, - int i12, int i13, int i14, int i15 ) // Init from scalars + v16int( int i00, int i01, int i02, int i03, int i04, int i05, int i06, + int i07, int i08, int i09, int i10, int i11, int i12, int i13, + int i14, int i15 ) // Init from scalars { - i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; - i[ 4] = i04; i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; - i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; - i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + i[0] = i00; + i[1] = i01; + i[2] = i02; + i[3] = i03; + i[4] = i04; + i[5] = i05; + i[6] = i06; + i[7] = i07; + i[8] = i08; + i[9] = i09; + i[10] = i10; + i[11] = i11; + i[12] = i12; + i[13] = i13; + i[14] = i14; + i[15] = i15; } - ~v16int() {} // Destructor + ~v16int() {} // Destructor // v16int assignment operators -# define ASSIGN(op) \ - inline v16int &operator op( const v16int &b ) \ - { \ - i[ 0] op b.i[ 0]; \ - i[ 1] op b.i[ 1]; \ - i[ 2] op b.i[ 2]; \ - i[ 3] op b.i[ 3]; \ - i[ 4] op b.i[ 4]; \ - i[ 5] op b.i[ 5]; \ - i[ 6] op b.i[ 6]; \ - i[ 7] op b.i[ 7]; \ - i[ 8] op b.i[ 8]; \ - i[ 9] op b.i[ 9]; \ - i[10] op b.i[10]; \ - i[11] op b.i[11]; \ - i[12] op b.i[12]; \ - i[13] op b.i[13]; \ - i[14] op b.i[14]; \ - i[15] op b.i[15]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16int& operator op( const v16int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + i[8] op b.i[8]; \ + i[9] op b.i[9]; \ + i[10] op b.i[10]; \ + i[11] op b.i[11]; \ + i[12] op 
b.i[12]; \ + i[13] op b.i[13]; \ + i[14] op b.i[14]; \ + i[15] op b.i[15]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v16int member access operator - inline int &operator []( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v16int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v16int operator op( const v16int& a ) \ + { \ + v16int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + b.i[8] = ( op a.i[8] ); \ + b.i[9] = ( op a.i[9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ } - inline int operator ()( int n ) - { - return i[n]; - } - }; - - // v16int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v16int operator op( const v16int & a ) \ - { \ - v16int b; \ - b.i[ 0] = (op a.i[ 0]); \ - b.i[ 1] = (op a.i[ 1]); \ - b.i[ 2] = (op a.i[ 2]); \ - b.i[ 3] = (op a.i[ 3]); \ - b.i[ 4] = (op a.i[ 4]); \ - b.i[ 5] = (op a.i[ 5]); \ - b.i[ 6] = (op a.i[ 6]); \ - b.i[ 7] = (op a.i[ 7]); \ - b.i[ 8] = (op a.i[ 8]); \ - b.i[ 9] = (op a.i[ 9]); \ - b.i[10] = (op a.i[10]); \ - b.i[11] = (op a.i[11]); \ - b.i[12] = (op a.i[12]); \ - b.i[13] = (op a.i[13]); \ - b.i[14] = (op a.i[14]); \ - b.i[15] = (op a.i[15]); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v16int operator !( const v16int & a ) 
- { +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v16int operator!( const v16int& a ) +{ v16int b; - b.i[ 0] = - ( !a.i[ 0] ); - b.i[ 1] = - ( !a.i[ 1] ); - b.i[ 2] = - ( !a.i[ 2] ); - b.i[ 3] = - ( !a.i[ 3] ); - b.i[ 4] = - ( !a.i[ 4] ); - b.i[ 5] = - ( !a.i[ 5] ); - b.i[ 6] = - ( !a.i[ 6] ); - b.i[ 7] = - ( !a.i[ 7] ); - b.i[ 8] = - ( !a.i[ 8] ); - b.i[ 9] = - ( !a.i[ 9] ); - b.i[10] = - ( !a.i[10] ); - b.i[11] = - ( !a.i[11] ); - b.i[12] = - ( !a.i[12] ); - b.i[13] = - ( !a.i[13] ); - b.i[14] = - ( !a.i[14] ); - b.i[15] = - ( !a.i[15] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); + b.i[8] = -( !a.i[8] ); + b.i[9] = -( !a.i[9] ); + b.i[10] = -( !a.i[10] ); + b.i[11] = -( !a.i[11] ); + b.i[12] = -( !a.i[12] ); + b.i[13] = -( !a.i[13] ); + b.i[14] = -( !a.i[14] ); + b.i[15] = -( !a.i[15] ); return b; - } - - PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - // v16int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v16int operator op( v16int & a ) \ - { \ - v16int b; \ - b.i[ 0] = ( op a.i[ 0] ); \ - b.i[ 1] = ( op a.i[ 1] ); \ - b.i[ 2] = ( op a.i[ 2] ); \ - b.i[ 3] = ( op a.i[ 3] ); \ - b.i[ 4] = ( op a.i[ 4] ); \ - b.i[ 5] = ( op a.i[ 5] ); \ - b.i[ 6] = ( op a.i[ 6] ); \ - b.i[ 7] = ( op a.i[ 7] ); \ - b.i[ 8] = ( op a.i[ 8] ); \ - b.i[ 9] = ( op a.i[ 9] ); \ - b.i[10] = ( op a.i[10] ); \ - b.i[11] = ( op a.i[11] ); \ - b.i[12] = ( op a.i[12] ); \ - b.i[13] = ( op a.i[13] ); \ - b.i[14] = ( op a.i[14] ); \ - b.i[15] = ( op a.i[15] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v16int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v16int operator op( v16int & a, int ) \ - { \ - v16int b; \ - b.i[ 0] = ( a.i[ 0] op ); \ - b.i[ 1] = ( a.i[ 1] op ); \ - b.i[ 2] = ( a.i[ 2] op ); \ - b.i[ 3] = ( a.i[ 3] op ); \ - b.i[ 4] = 
( a.i[ 4] op ); \ - b.i[ 5] = ( a.i[ 5] op ); \ - b.i[ 6] = ( a.i[ 6] op ); \ - b.i[ 7] = ( a.i[ 7] op ); \ - b.i[ 8] = ( a.i[ 8] op ); \ - b.i[ 9] = ( a.i[ 9] op ); \ - b.i[10] = ( a.i[10] op ); \ - b.i[11] = ( a.i[11] op ); \ - b.i[12] = ( a.i[12] op ); \ - b.i[13] = ( a.i[13] op ); \ - b.i[14] = ( a.i[14] op ); \ - b.i[15] = ( a.i[15] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v16int binary operators - -# define BINARY(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - c.i[ 0] = a.i[ 0] op b.i[ 0]; \ - c.i[ 1] = a.i[ 1] op b.i[ 1]; \ - c.i[ 2] = a.i[ 2] op b.i[ 2]; \ - c.i[ 3] = a.i[ 3] op b.i[ 3]; \ - c.i[ 4] = a.i[ 4] op b.i[ 4]; \ - c.i[ 5] = a.i[ 5] op b.i[ 5]; \ - c.i[ 6] = a.i[ 6] op b.i[ 6]; \ - c.i[ 7] = a.i[ 7] op b.i[ 7]; \ - c.i[ 8] = a.i[ 8] op b.i[ 8]; \ - c.i[ 9] = a.i[ 9] op b.i[ 9]; \ - c.i[10] = a.i[10] op b.i[10]; \ - c.i[11] = a.i[11] op b.i[11]; \ - c.i[12] = a.i[12] op b.i[12]; \ - c.i[13] = a.i[13] op b.i[13]; \ - c.i[14] = a.i[14] op b.i[14]; \ - c.i[15] = a.i[15] op b.i[15]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v16int logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - c.i[ 0] = - ( a.i[ 0] op b.i[ 0] ); \ - c.i[ 1] = - ( a.i[ 1] op b.i[ 1] ); \ - c.i[ 2] = - ( a.i[ 2] op b.i[ 2] ); \ - c.i[ 3] = - ( a.i[ 3] op b.i[ 3] ); \ - c.i[ 4] = - ( a.i[ 4] op b.i[ 4] ); \ - c.i[ 5] = - ( a.i[ 5] op b.i[ 5] ); \ - c.i[ 6] = - ( a.i[ 6] op b.i[ 6] ); \ - c.i[ 7] = - ( a.i[ 7] op b.i[ 7] ); \ - c.i[ 8] = - ( a.i[ 8] op b.i[ 8] ); \ - c.i[ 9] = - ( a.i[ 9] op b.i[ 9] ); \ - c.i[10] = - ( a.i[10] op b.i[10] ); \ - c.i[11] = - ( a.i[11] op b.i[11] ); \ - c.i[12] = - ( a.i[12] op b.i[12] ); \ - c.i[13] = - ( a.i[13] op b.i[13] ); \ - c.i[14] = - 
( a.i[14] op b.i[14] ); \ - c.i[15] = - ( a.i[15] op b.i[15] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16int miscellaneous functions - - inline v16int abs( const v16int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v16int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a ) \ + { \ + v16int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + b.i[8] = ( op a.i[8] ); \ + b.i[9] = ( op a.i[9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v16int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a, int ) \ + { \ + v16int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + b.i[8] = ( a.i[8] op ); \ + b.i[9] = ( a.i[9] op ); \ + b.i[10] = ( a.i[10] op ); \ + b.i[11] = ( a.i[11] op ); \ + b.i[12] = ( a.i[12] op ); \ + b.i[13] = ( a.i[13] op ); \ + b.i[14] = ( a.i[14] op ); \ + b.i[15] = ( a.i[15] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v16int binary operators + +#define BINARY( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = 
a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + c.i[8] = a.i[8] op b.i[8]; \ + c.i[9] = a.i[9] op b.i[9]; \ + c.i[10] = a.i[10] op b.i[10]; \ + c.i[11] = a.i[11] op b.i[11]; \ + c.i[12] = a.i[12] op b.i[12]; \ + c.i[13] = a.i[13] op b.i[13]; \ + c.i[14] = a.i[14] op b.i[14]; \ + c.i[15] = a.i[15] op b.i[15]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v16int logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + c.i[8] = -( a.i[8] op b.i[8] ); \ + c.i[9] = -( a.i[9] op b.i[9] ); \ + c.i[10] = -( a.i[10] op b.i[10] ); \ + c.i[11] = -( a.i[11] op b.i[11] ); \ + c.i[12] = -( a.i[12] op b.i[12] ); \ + c.i[13] = -( a.i[13] op b.i[13] ); \ + c.i[14] = -( a.i[14] op b.i[14] ); \ + c.i[15] = -( a.i[15] op b.i[15] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16int miscellaneous functions + +inline v16int abs( const v16int& a ) +{ v16int b; - b.i[ 0] = ( a.i[ 0] >= 0 ) ? a.i[ 0] : - a.i[ 0]; - b.i[ 1] = ( a.i[ 1] >= 0 ) ? a.i[ 1] : - a.i[ 1]; - b.i[ 2] = ( a.i[ 2] >= 0 ) ? a.i[ 2] : - a.i[ 2]; - b.i[ 3] = ( a.i[ 3] >= 0 ) ? a.i[ 3] : - a.i[ 3]; - b.i[ 4] = ( a.i[ 4] >= 0 ) ? a.i[ 4] : - a.i[ 4]; - b.i[ 5] = ( a.i[ 5] >= 0 ) ? a.i[ 5] : - a.i[ 5]; - b.i[ 6] = ( a.i[ 6] >= 0 ) ? a.i[ 6] : - a.i[ 6]; - b.i[ 7] = ( a.i[ 7] >= 0 ) ? a.i[ 7] : - a.i[ 7]; - b.i[ 8] = ( a.i[ 8] >= 0 ) ? 
a.i[ 8] : - a.i[ 8]; - b.i[ 9] = ( a.i[ 9] >= 0 ) ? a.i[ 9] : - a.i[ 9]; - b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : - a.i[10]; - b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : - a.i[11]; - b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : - a.i[12]; - b.i[13] = ( a.i[13] >= 0 ) ? a.i[13] : - a.i[13]; - b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : - a.i[14]; - b.i[15] = ( a.i[15] >= 0 ) ? a.i[15] : - a.i[15]; + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; + b.i[8] = ( a.i[8] >= 0 ) ? a.i[8] : -a.i[8]; + b.i[9] = ( a.i[9] >= 0 ) ? a.i[9] : -a.i[9]; + b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : -a.i[10]; + b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : -a.i[11]; + b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : -a.i[12]; + b.i[13] = ( a.i[13] >= 0 ) ? a.i[13] : -a.i[13]; + b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : -a.i[14]; + b.i[15] = ( a.i[15] >= 0 ) ? 
a.i[15] : -a.i[15]; return b; - } +} - inline v16 czero( const v16int &c, const v16 &a ) - { +inline v16 czero( const v16int& c, const v16& a ) +{ v16 b; - b.i[ 0] = a.i[ 0] & ~c.i[ 0]; - b.i[ 1] = a.i[ 1] & ~c.i[ 1]; - b.i[ 2] = a.i[ 2] & ~c.i[ 2]; - b.i[ 3] = a.i[ 3] & ~c.i[ 3]; - b.i[ 4] = a.i[ 4] & ~c.i[ 4]; - b.i[ 5] = a.i[ 5] & ~c.i[ 5]; - b.i[ 6] = a.i[ 6] & ~c.i[ 6]; - b.i[ 7] = a.i[ 7] & ~c.i[ 7]; - b.i[ 8] = a.i[ 8] & ~c.i[ 8]; - b.i[ 9] = a.i[ 9] & ~c.i[ 9]; + b.i[0] = a.i[0] & ~c.i[0]; + b.i[1] = a.i[1] & ~c.i[1]; + b.i[2] = a.i[2] & ~c.i[2]; + b.i[3] = a.i[3] & ~c.i[3]; + b.i[4] = a.i[4] & ~c.i[4]; + b.i[5] = a.i[5] & ~c.i[5]; + b.i[6] = a.i[6] & ~c.i[6]; + b.i[7] = a.i[7] & ~c.i[7]; + b.i[8] = a.i[8] & ~c.i[8]; + b.i[9] = a.i[9] & ~c.i[9]; b.i[10] = a.i[10] & ~c.i[10]; b.i[11] = a.i[11] & ~c.i[11]; b.i[12] = a.i[12] & ~c.i[12]; @@ -3363,22 +3382,22 @@ namespace v16 b.i[15] = a.i[15] & ~c.i[15]; return b; - } +} - inline v16 notczero( const v16int &c, const v16 &a ) - { +inline v16 notczero( const v16int& c, const v16& a ) +{ v16 b; - b.i[ 0] = a.i[ 0] & c.i[ 0]; - b.i[ 1] = a.i[ 1] & c.i[ 1]; - b.i[ 2] = a.i[ 2] & c.i[ 2]; - b.i[ 3] = a.i[ 3] & c.i[ 3]; - b.i[ 4] = a.i[ 4] & c.i[ 4]; - b.i[ 5] = a.i[ 5] & c.i[ 5]; - b.i[ 6] = a.i[ 6] & c.i[ 6]; - b.i[ 7] = a.i[ 7] & c.i[ 7]; - b.i[ 8] = a.i[ 8] & c.i[ 8]; - b.i[ 9] = a.i[ 9] & c.i[ 9]; + b.i[0] = a.i[0] & c.i[0]; + b.i[1] = a.i[1] & c.i[1]; + b.i[2] = a.i[2] & c.i[2]; + b.i[3] = a.i[3] & c.i[3]; + b.i[4] = a.i[4] & c.i[4]; + b.i[5] = a.i[5] & c.i[5]; + b.i[6] = a.i[6] & c.i[6]; + b.i[7] = a.i[7] & c.i[7]; + b.i[8] = a.i[8] & c.i[8]; + b.i[9] = a.i[9] & c.i[9]; b.i[10] = a.i[10] & c.i[10]; b.i[11] = a.i[11] & c.i[11]; b.i[12] = a.i[12] & c.i[12]; @@ -3387,22 +3406,22 @@ namespace v16 b.i[15] = a.i[15] & c.i[15]; return b; - } +} - inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) - { +inline v16 merge( const v16int& c, const v16& t, const v16& f ) +{ v16 m; - m.i[ 0] = ( f.i[ 0] & ~c.i[ 
0] ) | ( t.i[ 0] & c.i[ 0] ); - m.i[ 1] = ( f.i[ 1] & ~c.i[ 1] ) | ( t.i[ 1] & c.i[ 1] ); - m.i[ 2] = ( f.i[ 2] & ~c.i[ 2] ) | ( t.i[ 2] & c.i[ 2] ); - m.i[ 3] = ( f.i[ 3] & ~c.i[ 3] ) | ( t.i[ 3] & c.i[ 3] ); - m.i[ 4] = ( f.i[ 4] & ~c.i[ 4] ) | ( t.i[ 4] & c.i[ 4] ); - m.i[ 5] = ( f.i[ 5] & ~c.i[ 5] ) | ( t.i[ 5] & c.i[ 5] ); - m.i[ 6] = ( f.i[ 6] & ~c.i[ 6] ) | ( t.i[ 6] & c.i[ 6] ); - m.i[ 7] = ( f.i[ 7] & ~c.i[ 7] ) | ( t.i[ 7] & c.i[ 7] ); - m.i[ 8] = ( f.i[ 8] & ~c.i[ 8] ) | ( t.i[ 8] & c.i[ 8] ); - m.i[ 9] = ( f.i[ 9] & ~c.i[ 9] ) | ( t.i[ 9] & c.i[ 9] ); + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + m.i[4] = ( f.i[4] & ~c.i[4] ) | ( t.i[4] & c.i[4] ); + m.i[5] = ( f.i[5] & ~c.i[5] ) | ( t.i[5] & c.i[5] ); + m.i[6] = ( f.i[6] & ~c.i[6] ) | ( t.i[6] & c.i[6] ); + m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); + m.i[8] = ( f.i[8] & ~c.i[8] ) | ( t.i[8] & c.i[8] ); + m.i[9] = ( f.i[9] & ~c.i[9] ) | ( t.i[9] & c.i[9] ); m.i[10] = ( f.i[10] & ~c.i[10] ) | ( t.i[10] & c.i[10] ); m.i[11] = ( f.i[11] & ~c.i[11] ) | ( t.i[11] & c.i[11] ); m.i[12] = ( f.i[12] & ~c.i[12] ) | ( t.i[12] & c.i[12] ); @@ -3411,186 +3430,263 @@ namespace v16 m.i[15] = ( f.i[15] & ~c.i[15] ) | ( t.i[15] & c.i[15] ); return m; - } +} - //////////////// - // v16float class +//////////////// +// v16float class - class v16float : public v16 - { +class v16float : public v16 +{ // v16float prefix unary operator friends - friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator-( const 
v16float& a ) ALWAYS_INLINE; + friend inline v16float operator~( const v16float& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16float prefix increment / decrement operator friends - friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a ) ALWAYS_INLINE; // v16float postfix increment / decrement operator friends - friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a, int ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a, int ) ALWAYS_INLINE; // v16float binary operator friends - friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator*(const v16float& a, + const v16float& b)ALWAYS_INLINE; + friend inline v16float operator/( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline 
v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float math library friends -# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ - const v16float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v16float fn( const v16float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v16float fn( const v16float& a, const v16float& b ) \ + ALWAYS_INLINE + + 
CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v16float miscellaneous friends - friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rsqrt( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp( const v16float& a ) ALWAYS_INLINE; + friend inline v16float fma( const v16float& a, const v16float& b, + const v16float& c ) 
ALWAYS_INLINE; + friend inline v16float fms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline void increment_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void scale_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; public: - // v16float constructors / destructors - v16float() {} // Default constructor + v16float() {} // Default constructor - v16float( const v16float &a ) // Copy constructor + v16float( const v16float& a ) // Copy constructor { - f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; - f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; - f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; - f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + f[8] = a.f[8]; + f[9] = a.f[9]; + f[10] = a.f[10]; + f[11] = a.f[11]; + f[12] = a.f[12]; + f[13] = a.f[13]; + f[14] = a.f[14]; + f[15] = a.f[15]; } - v16float( const v16 &a ) // Init from mixed + v16float( const v16& a ) // Init from mixed { - f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; - f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; - f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; - f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + f[0] = a.f[0]; + f[1] = a.f[1]; + 
f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + f[8] = a.f[8]; + f[9] = a.f[9]; + f[10] = a.f[10]; + f[11] = a.f[11]; + f[12] = a.f[12]; + f[13] = a.f[13]; + f[14] = a.f[14]; + f[15] = a.f[15]; } - v16float( float a ) // Init from scalar + v16float( float a ) // Init from scalar { - f[ 0] = a; f[ 1] = a; f[ 2] = a; f[ 3] = a; - f[ 4] = a; f[ 5] = a; f[ 6] = a; f[ 7] = a; - f[ 8] = a; f[ 9] = a; f[10] = a; f[11] = a; - f[12] = a; f[13] = a; f[14] = a; f[15] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; + f[8] = a; + f[9] = a; + f[10] = a; + f[11] = a; + f[12] = a; + f[13] = a; + f[14] = a; + f[15] = a; } - v16float( float f00, float f01, float f02, float f03, - float f04, float f05, float f06, float f07, - float f08, float f09, float f10, float f11, - float f12, float f13, float f14, float f15 ) // Init from scalars + v16float( float f00, float f01, float f02, float f03, float f04, float f05, + float f06, float f07, float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars { - f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; - f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; - f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; - f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + f[0] = f00; + f[1] = f01; + f[2] = f02; + f[3] = f03; + f[4] = f04; + f[5] = f05; + f[6] = f06; + f[7] = f07; + f[8] = f08; + f[9] = f09; + f[10] = f10; + f[11] = f11; + f[12] = f12; + f[13] = f13; + f[14] = f14; + f[15] = f15; } - ~v16float() {} // Destructor + ~v16float() {} // Destructor // v16float assignment operators -# define ASSIGN(op) \ - inline v16float &operator op( const v16float &b ) \ - { \ - f[ 0] op b.f[ 0]; \ - f[ 1] op b.f[ 1]; \ - f[ 2] op b.f[ 2]; \ - f[ 3] op b.f[ 3]; \ - f[ 4] op b.f[ 4]; \ - f[ 5] op b.f[ 5]; \ - f[ 6] op b.f[ 6]; \ - f[ 7] op b.f[ 7]; \ - f[ 8] op b.f[ 8]; \ - f[ 9] op b.f[ 9]; \ - 
f[10] op b.f[10]; \ - f[11] op b.f[11]; \ - f[12] op b.f[12]; \ - f[13] op b.f[13]; \ - f[14] op b.f[14]; \ - f[15] op b.f[15]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16float& operator op( const v16float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + f[8] op b.f[8]; \ + f[9] op b.f[9]; \ + f[10] op b.f[10]; \ + f[11] op b.f[11]; \ + f[12] op b.f[12]; \ + f[13] op b.f[13]; \ + f[14] op b.f[14]; \ + f[15] op b.f[15]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v16float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v16float prefix unary operators +// v16float prefix unary operators - inline v16float operator +( const v16float &a ) - { +inline v16float operator+( const v16float& a ) +{ v16float b; - b.f[ 0] = +a.f[ 0]; - b.f[ 1] = +a.f[ 1]; - b.f[ 2] = +a.f[ 2]; - b.f[ 3] = +a.f[ 3]; - b.f[ 4] = +a.f[ 4]; - b.f[ 5] = +a.f[ 5]; - b.f[ 6] = +a.f[ 6]; - b.f[ 7] = +a.f[ 7]; - b.f[ 8] = +a.f[ 8]; - b.f[ 9] = +a.f[ 9]; + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + b.f[4] = +a.f[4]; + b.f[5] = +a.f[5]; + b.f[6] = +a.f[6]; + b.f[7] = +a.f[7]; + b.f[8] = +a.f[8]; + b.f[9] = +a.f[9]; b.f[10] = +a.f[10]; b.f[11] = +a.f[11]; b.f[12] = +a.f[12]; @@ -3599,22 +3695,22 @@ namespace v16 b.f[15] = +a.f[15]; return b; - } +} - inline v16float operator -( const v16float &a ) - { +inline v16float operator-( const v16float& a ) +{ v16float b; - b.f[ 0] = -a.f[ 0]; - b.f[ 1] = -a.f[ 1]; - b.f[ 2] = -a.f[ 2]; - b.f[ 3] = -a.f[ 3]; - b.f[ 4] = -a.f[ 4]; - b.f[ 
5] = -a.f[ 5]; - b.f[ 6] = -a.f[ 6]; - b.f[ 7] = -a.f[ 7]; - b.f[ 8] = -a.f[ 8]; - b.f[ 9] = -a.f[ 9]; + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + b.f[4] = -a.f[4]; + b.f[5] = -a.f[5]; + b.f[6] = -a.f[6]; + b.f[7] = -a.f[7]; + b.f[8] = -a.f[8]; + b.f[9] = -a.f[9]; b.f[10] = -a.f[10]; b.f[11] = -a.f[11]; b.f[12] = -a.f[12]; @@ -3623,22 +3719,22 @@ namespace v16 b.f[15] = -a.f[15]; return b; - } +} - inline v16int operator !( const v16float &a ) - { +inline v16int operator!( const v16float& a ) +{ v16int b; - b.i[ 0] = a.i[ 0] ? 0 : -1; - b.i[ 1] = a.i[ 1] ? 0 : -1; - b.i[ 2] = a.i[ 2] ? 0 : -1; - b.i[ 3] = a.i[ 3] ? 0 : -1; - b.i[ 4] = a.i[ 4] ? 0 : -1; - b.i[ 5] = a.i[ 5] ? 0 : -1; - b.i[ 6] = a.i[ 6] ? 0 : -1; - b.i[ 7] = a.i[ 7] ? 0 : -1; - b.i[ 8] = a.i[ 8] ? 0 : -1; - b.i[ 9] = a.i[ 9] ? 0 : -1; + b.i[0] = a.i[0] ? 0 : -1; + b.i[1] = a.i[1] ? 0 : -1; + b.i[2] = a.i[2] ? 0 : -1; + b.i[3] = a.i[3] ? 0 : -1; + b.i[4] = a.i[4] ? 0 : -1; + b.i[5] = a.i[5] ? 0 : -1; + b.i[6] = a.i[6] ? 0 : -1; + b.i[7] = a.i[7] ? 0 : -1; + b.i[8] = a.i[8] ? 0 : -1; + b.i[9] = a.i[9] ? 0 : -1; b.i[10] = a.i[10] ? 0 : -1; b.i[11] = a.i[11] ? 0 : -1; b.i[12] = a.i[12] ? 0 : -1; @@ -3647,24 +3743,24 @@ namespace v16 b.i[15] = a.i[15] ? 
0 : -1; return b; - } +} - // v16float prefix increment / decrement operators +// v16float prefix increment / decrement operators - inline v16float operator ++( v16float &a ) - { +inline v16float operator++( v16float& a ) +{ v16float b; - b.f[ 0] = ++a.f[ 0]; - b.f[ 1] = ++a.f[ 1]; - b.f[ 2] = ++a.f[ 2]; - b.f[ 3] = ++a.f[ 3]; - b.f[ 4] = ++a.f[ 4]; - b.f[ 5] = ++a.f[ 5]; - b.f[ 6] = ++a.f[ 6]; - b.f[ 7] = ++a.f[ 7]; - b.f[ 8] = ++a.f[ 8]; - b.f[ 9] = ++a.f[ 9]; + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + b.f[4] = ++a.f[4]; + b.f[5] = ++a.f[5]; + b.f[6] = ++a.f[6]; + b.f[7] = ++a.f[7]; + b.f[8] = ++a.f[8]; + b.f[9] = ++a.f[9]; b.f[10] = ++a.f[10]; b.f[11] = ++a.f[11]; b.f[12] = ++a.f[12]; @@ -3673,22 +3769,22 @@ namespace v16 b.f[15] = ++a.f[15]; return b; - } +} - inline v16float operator --( v16float &a ) - { +inline v16float operator--( v16float& a ) +{ v16float b; - b.f[ 0] = --a.f[ 0]; - b.f[ 1] = --a.f[ 1]; - b.f[ 2] = --a.f[ 2]; - b.f[ 3] = --a.f[ 3]; - b.f[ 4] = --a.f[ 4]; - b.f[ 5] = --a.f[ 5]; - b.f[ 6] = --a.f[ 6]; - b.f[ 7] = --a.f[ 7]; - b.f[ 8] = --a.f[ 8]; - b.f[ 9] = --a.f[ 9]; + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + b.f[4] = --a.f[4]; + b.f[5] = --a.f[5]; + b.f[6] = --a.f[6]; + b.f[7] = --a.f[7]; + b.f[8] = --a.f[8]; + b.f[9] = --a.f[9]; b.f[10] = --a.f[10]; b.f[11] = --a.f[11]; b.f[12] = --a.f[12]; @@ -3697,24 +3793,24 @@ namespace v16 b.f[15] = --a.f[15]; return b; - } +} - // v16float postfix increment / decrement operators +// v16float postfix increment / decrement operators - inline v16float operator ++( v16float &a, int ) - { +inline v16float operator++( v16float& a, int ) +{ v16float b; - b.f[ 0] = a.f[ 0]++; - b.f[ 1] = a.f[ 1]++; - b.f[ 2] = a.f[ 2]++; - b.f[ 3] = a.f[ 3]++; - b.f[ 4] = a.f[ 4]++; - b.f[ 5] = a.f[ 5]++; - b.f[ 6] = a.f[ 6]++; - b.f[ 7] = a.f[ 7]++; - b.f[ 8] = a.f[ 8]++; - b.f[ 9] = a.f[ 9]++; + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + 
b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + b.f[4] = a.f[4]++; + b.f[5] = a.f[5]++; + b.f[6] = a.f[6]++; + b.f[7] = a.f[7]++; + b.f[8] = a.f[8]++; + b.f[9] = a.f[9]++; b.f[10] = a.f[10]++; b.f[11] = a.f[11]++; b.f[12] = a.f[12]++; @@ -3723,22 +3819,22 @@ namespace v16 b.f[15] = a.f[15]++; return b; - } +} - inline v16float operator --( v16float &a, int ) - { +inline v16float operator--( v16float& a, int ) +{ v16float b; - b.f[ 0] = a.f[ 0]--; - b.f[ 1] = a.f[ 1]--; - b.f[ 2] = a.f[ 2]--; - b.f[ 3] = a.f[ 3]--; - b.f[ 4] = a.f[ 4]--; - b.f[ 5] = a.f[ 5]--; - b.f[ 6] = a.f[ 6]--; - b.f[ 7] = a.f[ 7]--; - b.f[ 8] = a.f[ 8]--; - b.f[ 9] = a.f[ 9]--; + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + b.f[4] = a.f[4]--; + b.f[5] = a.f[5]--; + b.f[6] = a.f[6]--; + b.f[7] = a.f[7]--; + b.f[8] = a.f[8]--; + b.f[9] = a.f[9]--; b.f[10] = a.f[10]--; b.f[11] = a.f[11]--; b.f[12] = a.f[12]--; @@ -3747,317 +3843,335 @@ namespace v16 b.f[15] = a.f[15]--; return b; - } - - // v16float binary operators - -# define BINARY(op) \ - inline v16float operator op( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - c.f[ 0] = a.f[ 0] op b.f[ 0]; \ - c.f[ 1] = a.f[ 1] op b.f[ 1]; \ - c.f[ 2] = a.f[ 2] op b.f[ 2]; \ - c.f[ 3] = a.f[ 3] op b.f[ 3]; \ - c.f[ 4] = a.f[ 4] op b.f[ 4]; \ - c.f[ 5] = a.f[ 5] op b.f[ 5]; \ - c.f[ 6] = a.f[ 6] op b.f[ 6]; \ - c.f[ 7] = a.f[ 7] op b.f[ 7]; \ - c.f[ 8] = a.f[ 8] op b.f[ 8]; \ - c.f[ 9] = a.f[ 9] op b.f[ 9]; \ - c.f[10] = a.f[10] op b.f[10]; \ - c.f[11] = a.f[11] op b.f[11]; \ - c.f[12] = a.f[12] op b.f[12]; \ - c.f[13] = a.f[13] op b.f[13]; \ - c.f[14] = a.f[14] op b.f[14]; \ - c.f[15] = a.f[15] op b.f[15]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v16float logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16float &a, const v16float &b ) \ - { \ - v16int c; \ - c.i[ 0] = -( a.f[ 0] op b.f[ 0] ); \ - c.i[ 1] = -( a.f[ 1] op 
b.f[ 1] ); \ - c.i[ 2] = -( a.f[ 2] op b.f[ 2] ); \ - c.i[ 3] = -( a.f[ 3] op b.f[ 3] ); \ - c.i[ 4] = -( a.f[ 4] op b.f[ 4] ); \ - c.i[ 5] = -( a.f[ 5] op b.f[ 5] ); \ - c.i[ 6] = -( a.f[ 6] op b.f[ 6] ); \ - c.i[ 7] = -( a.f[ 7] op b.f[ 7] ); \ - c.i[ 8] = -( a.f[ 8] op b.f[ 8] ); \ - c.i[ 9] = -( a.f[ 9] op b.f[ 9] ); \ - c.i[10] = -( a.f[10] op b.f[10] ); \ - c.i[11] = -( a.f[11] op b.f[11] ); \ - c.i[12] = -( a.f[12] op b.f[12] ); \ - c.i[13] = -( a.f[13] op b.f[13] ); \ - c.i[14] = -( a.f[14] op b.f[14] ); \ - c.i[15] = -( a.f[15] op b.f[15] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16float math library functions - -# define CMATH_FR1(fn) \ - inline v16float fn( const v16float &a ) \ - { \ - v16float b; \ - b.f[ 0] = ::fn( a.f[ 0] ); \ - b.f[ 1] = ::fn( a.f[ 1] ); \ - b.f[ 2] = ::fn( a.f[ 2] ); \ - b.f[ 3] = ::fn( a.f[ 3] ); \ - b.f[ 4] = ::fn( a.f[ 4] ); \ - b.f[ 5] = ::fn( a.f[ 5] ); \ - b.f[ 6] = ::fn( a.f[ 6] ); \ - b.f[ 7] = ::fn( a.f[ 7] ); \ - b.f[ 8] = ::fn( a.f[ 8] ); \ - b.f[ 9] = ::fn( a.f[ 9] ); \ - b.f[10] = ::fn( a.f[10] ); \ - b.f[11] = ::fn( a.f[11] ); \ - b.f[12] = ::fn( a.f[12] ); \ - b.f[13] = ::fn( a.f[13] ); \ - b.f[14] = ::fn( a.f[14] ); \ - b.f[15] = ::fn( a.f[15] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v16float fn( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - c.f[ 0] = ::fn( a.f[ 0], b.f[ 0] ); \ - c.f[ 1] = ::fn( a.f[ 1], b.f[ 1] ); \ - c.f[ 2] = ::fn( a.f[ 2], b.f[ 2] ); \ - c.f[ 3] = ::fn( a.f[ 3], b.f[ 3] ); \ - c.f[ 4] = ::fn( a.f[ 4], b.f[ 4] ); \ - c.f[ 5] = ::fn( a.f[ 5], b.f[ 5] ); \ - c.f[ 6] = ::fn( a.f[ 6], b.f[ 6] ); \ - c.f[ 7] = ::fn( a.f[ 7], b.f[ 7] ); \ - c.f[ 8] = ::fn( a.f[ 8], b.f[ 8] ); \ - c.f[ 9] = ::fn( a.f[ 9], b.f[ 9] ); \ - c.f[10] = ::fn( a.f[10], b.f[10] ); \ - c.f[11] = ::fn( a.f[11], b.f[11] ); \ - c.f[12] = ::fn( a.f[12], b.f[12] ); \ - c.f[13] 
= ::fn( a.f[13], b.f[13] ); \ - c.f[14] = ::fn( a.f[14], b.f[14] ); \ - c.f[15] = ::fn( a.f[15], b.f[15] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v16float copysign( const v16float &a, const v16float &b ) - { - v16float c; - float t; - - t = ::fabs( a.f[ 0] ); - if( b.f[ 0] < 0 ) t = -t; - c.f[ 0] = t; - - t = ::fabs( a.f[ 1] ); - if( b.f[ 1] < 0 ) t = -t; - c.f[ 1] = t; - - t = ::fabs( a.f[ 2] ); - if( b.f[ 2] < 0 ) t = -t; - c.f[ 2] = t; - - t = ::fabs( a.f[ 3] ); - if( b.f[ 3] < 0 ) t = -t; - c.f[ 3] = t; +} + +// v16float binary operators + +#define BINARY( op ) \ + inline v16float operator op( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + c.f[8] = a.f[8] op b.f[8]; \ + c.f[9] = a.f[9] op b.f[9]; \ + c.f[10] = a.f[10] op b.f[10]; \ + c.f[11] = a.f[11] op b.f[11]; \ + c.f[12] = a.f[12] op b.f[12]; \ + c.f[13] = a.f[13] op b.f[13]; \ + c.f[14] = a.f[14] op b.f[14]; \ + c.f[15] = a.f[15] op b.f[15]; \ + return c; \ + } - t = ::fabs( a.f[ 4] ); - if( b.f[ 4] < 0 ) t = -t; - c.f[ 4] = t; +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v16float logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16float& a, const v16float& b ) \ + { \ + v16int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op 
b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + c.i[8] = -( a.f[8] op b.f[8] ); \ + c.i[9] = -( a.f[9] op b.f[9] ); \ + c.i[10] = -( a.f[10] op b.f[10] ); \ + c.i[11] = -( a.f[11] op b.f[11] ); \ + c.i[12] = -( a.f[12] op b.f[12] ); \ + c.i[13] = -( a.f[13] op b.f[13] ); \ + c.i[14] = -( a.f[14] op b.f[14] ); \ + c.i[15] = -( a.f[15] op b.f[15] ); \ + return c; \ + } - t = ::fabs( a.f[ 5] ); - if( b.f[ 5] < 0 ) t = -t; - c.f[ 5] = t; +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16float math library functions + +#define CMATH_FR1( fn ) \ + inline v16float fn( const v16float& a ) \ + { \ + v16float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + b.f[8] = ::fn( a.f[8] ); \ + b.f[9] = ::fn( a.f[9] ); \ + b.f[10] = ::fn( a.f[10] ); \ + b.f[11] = ::fn( a.f[11] ); \ + b.f[12] = ::fn( a.f[12] ); \ + b.f[13] = ::fn( a.f[13] ); \ + b.f[14] = ::fn( a.f[14] ); \ + b.f[15] = ::fn( a.f[15] ); \ + return b; \ + } - t = ::fabs( a.f[ 6] ); - if( b.f[ 6] < 0 ) t = -t; - c.f[ 6] = t; +#define CMATH_FR2( fn ) \ + inline v16float fn( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + c.f[8] = ::fn( a.f[8], b.f[8] ); \ + c.f[9] = ::fn( a.f[9], b.f[9] ); \ + c.f[10] = ::fn( a.f[10], b.f[10] ); \ + c.f[11] = ::fn( a.f[11], b.f[11] ); \ + c.f[12] = ::fn( a.f[12], b.f[12] ); \ + c.f[13] = ::fn( a.f[13], b.f[13] ); \ + c.f[14] = ::fn( a.f[14], b.f[14] ); \ + c.f[15] = 
::fn( a.f[15], b.f[15] ); \ + return c; \ + } - t = ::fabs( a.f[ 7] ); - if( b.f[ 7] < 0 ) t = -t; - c.f[ 7] = t; +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) - t = ::fabs( a.f[ 8] ); - if( b.f[ 8] < 0 ) t = -t; - c.f[ 8] = t; + inline v16float + copysign( const v16float& a, const v16float& b ) +{ + v16float c; + float t; - t = ::fabs( a.f[ 9] ); - if( b.f[ 9] < 0 ) t = -t; - c.f[ 9] = t; + t = ::fabs( a.f[0] ); + if ( b.f[0] < 0 ) + t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if ( b.f[1] < 0 ) + t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if ( b.f[2] < 0 ) + t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if ( b.f[3] < 0 ) + t = -t; + c.f[3] = t; + + t = ::fabs( a.f[4] ); + if ( b.f[4] < 0 ) + t = -t; + c.f[4] = t; + + t = ::fabs( a.f[5] ); + if ( b.f[5] < 0 ) + t = -t; + c.f[5] = t; + + t = ::fabs( a.f[6] ); + if ( b.f[6] < 0 ) + t = -t; + c.f[6] = t; + + t = ::fabs( a.f[7] ); + if ( b.f[7] < 0 ) + t = -t; + c.f[7] = t; + + t = ::fabs( a.f[8] ); + if ( b.f[8] < 0 ) + t = -t; + c.f[8] = t; + + t = ::fabs( a.f[9] ); + if ( b.f[9] < 0 ) + t = -t; + c.f[9] = t; t = ::fabs( a.f[10] ); - if( b.f[10] < 0 ) t = -t; + if ( b.f[10] < 0 ) + t = -t; c.f[10] = t; t = ::fabs( a.f[11] ); - if( b.f[11] < 0 ) t = -t; + if ( b.f[11] < 0 ) + t = -t; c.f[11] = t; t = ::fabs( a.f[12] ); - if( b.f[12] < 0 ) t = -t; + if ( b.f[12] < 0 ) + t = -t; c.f[12] = t; t = ::fabs( a.f[13] ); - if( b.f[13] < 0 ) t = -t; + if ( b.f[13] < 0 ) + t = -t; c.f[13] = t; t = ::fabs( a.f[14] ); - if( b.f[14] < 0 ) t = -t; + if ( b.f[14] < 0 ) + t = -t; c.f[14] = t; t = ::fabs( a.f[15] ); - if( b.f[15] < 0 ) t = -t; + if ( b.f[15] < 0 ) + t = -t; c.f[15] = t; return c; - } +} -# undef CMATH_FR1 
-# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v16float miscellaneous functions +// v16float miscellaneous functions - inline v16float rsqrt_approx( const v16float &a ) - { +inline v16float rsqrt_approx( const v16float& a ) +{ v16float b; - b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); - b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); - b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); - b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); - b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); - b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); - b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); - b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); - b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); - b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); - b.f[10] = ::sqrt( 1.0f/a.f[10] ); - b.f[11] = ::sqrt( 1.0f/a.f[11] ); - b.f[12] = ::sqrt( 1.0f/a.f[12] ); - b.f[13] = ::sqrt( 1.0f/a.f[13] ); - b.f[14] = ::sqrt( 1.0f/a.f[14] ); - b.f[15] = ::sqrt( 1.0f/a.f[15] ); + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + b.f[8] = ::sqrt( 1.0f / a.f[8] ); + b.f[9] = ::sqrt( 1.0f / a.f[9] ); + b.f[10] = ::sqrt( 1.0f / a.f[10] ); + b.f[11] = ::sqrt( 1.0f / a.f[11] ); + b.f[12] = ::sqrt( 1.0f / a.f[12] ); + b.f[13] = ::sqrt( 1.0f / a.f[13] ); + b.f[14] = ::sqrt( 1.0f / a.f[14] ); + b.f[15] = ::sqrt( 1.0f / a.f[15] ); return b; - } +} - inline v16float rsqrt( const v16float &a ) - { +inline v16float rsqrt( const v16float& a ) +{ v16float b; - b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); - b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); - b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); - b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); - b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); - b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); - b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); - b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); - b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); - b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); - b.f[10] = ::sqrt( 1.0f/a.f[10] ); - b.f[11] = ::sqrt( 1.0f/a.f[11] ); - b.f[12] = ::sqrt( 
1.0f/a.f[12] ); - b.f[13] = ::sqrt( 1.0f/a.f[13] ); - b.f[14] = ::sqrt( 1.0f/a.f[14] ); - b.f[15] = ::sqrt( 1.0f/a.f[15] ); + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + b.f[8] = ::sqrt( 1.0f / a.f[8] ); + b.f[9] = ::sqrt( 1.0f / a.f[9] ); + b.f[10] = ::sqrt( 1.0f / a.f[10] ); + b.f[11] = ::sqrt( 1.0f / a.f[11] ); + b.f[12] = ::sqrt( 1.0f / a.f[12] ); + b.f[13] = ::sqrt( 1.0f / a.f[13] ); + b.f[14] = ::sqrt( 1.0f / a.f[14] ); + b.f[15] = ::sqrt( 1.0f / a.f[15] ); return b; - } +} - inline v16float rcp_approx( const v16float &a ) - { +inline v16float rcp_approx( const v16float& a ) +{ v16float b; - b.f[ 0] = 1.0f/a.f[ 0]; - b.f[ 1] = 1.0f/a.f[ 1]; - b.f[ 2] = 1.0f/a.f[ 2]; - b.f[ 3] = 1.0f/a.f[ 3]; - b.f[ 4] = 1.0f/a.f[ 4]; - b.f[ 5] = 1.0f/a.f[ 5]; - b.f[ 6] = 1.0f/a.f[ 6]; - b.f[ 7] = 1.0f/a.f[ 7]; - b.f[ 8] = 1.0f/a.f[ 8]; - b.f[ 9] = 1.0f/a.f[ 9]; - b.f[10] = 1.0f/a.f[10]; - b.f[11] = 1.0f/a.f[11]; - b.f[12] = 1.0f/a.f[12]; - b.f[13] = 1.0f/a.f[13]; - b.f[14] = 1.0f/a.f[14]; - b.f[15] = 1.0f/a.f[15]; + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + b.f[8] = 1.0f / a.f[8]; + b.f[9] = 1.0f / a.f[9]; + b.f[10] = 1.0f / a.f[10]; + b.f[11] = 1.0f / a.f[11]; + b.f[12] = 1.0f / a.f[12]; + b.f[13] = 1.0f / a.f[13]; + b.f[14] = 1.0f / a.f[14]; + b.f[15] = 1.0f / a.f[15]; return b; - } +} - inline v16float rcp( const v16float &a ) - { +inline v16float rcp( const v16float& a ) +{ v16float b; - b.f[ 0] = 1.0f/a.f[ 0]; - b.f[ 1] = 1.0f/a.f[ 1]; - b.f[ 2] = 1.0f/a.f[ 2]; - b.f[ 3] = 1.0f/a.f[ 3]; - b.f[ 4] = 1.0f/a.f[ 4]; - b.f[ 5] = 1.0f/a.f[ 5]; - b.f[ 6] = 1.0f/a.f[ 6]; - 
b.f[ 7] = 1.0f/a.f[ 7]; - b.f[ 8] = 1.0f/a.f[ 8]; - b.f[ 9] = 1.0f/a.f[ 9]; - b.f[10] = 1.0f/a.f[10]; - b.f[11] = 1.0f/a.f[11]; - b.f[12] = 1.0f/a.f[12]; - b.f[13] = 1.0f/a.f[13]; - b.f[14] = 1.0f/a.f[14]; - b.f[15] = 1.0f/a.f[15]; + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + b.f[8] = 1.0f / a.f[8]; + b.f[9] = 1.0f / a.f[9]; + b.f[10] = 1.0f / a.f[10]; + b.f[11] = 1.0f / a.f[11]; + b.f[12] = 1.0f / a.f[12]; + b.f[13] = 1.0f / a.f[13]; + b.f[14] = 1.0f / a.f[14]; + b.f[15] = 1.0f / a.f[15]; return b; - } +} - inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fma( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; - d.f[ 0] = a.f[ 0] * b.f[ 0] + c.f[ 0]; - d.f[ 1] = a.f[ 1] * b.f[ 1] + c.f[ 1]; - d.f[ 2] = a.f[ 2] * b.f[ 2] + c.f[ 2]; - d.f[ 3] = a.f[ 3] * b.f[ 3] + c.f[ 3]; - d.f[ 4] = a.f[ 4] * b.f[ 4] + c.f[ 4]; - d.f[ 5] = a.f[ 5] * b.f[ 5] + c.f[ 5]; - d.f[ 6] = a.f[ 6] * b.f[ 6] + c.f[ 6]; - d.f[ 7] = a.f[ 7] * b.f[ 7] + c.f[ 7]; - d.f[ 8] = a.f[ 8] * b.f[ 8] + c.f[ 8]; - d.f[ 9] = a.f[ 9] * b.f[ 9] + c.f[ 9]; + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + d.f[4] = a.f[4] * b.f[4] + c.f[4]; + d.f[5] = a.f[5] * b.f[5] + c.f[5]; + d.f[6] = a.f[6] * b.f[6] + c.f[6]; + d.f[7] = a.f[7] * b.f[7] + c.f[7]; + d.f[8] = a.f[8] * b.f[8] + c.f[8]; + d.f[9] = a.f[9] * b.f[9] + c.f[9]; d.f[10] = a.f[10] * b.f[10] + c.f[10]; d.f[11] = a.f[11] * b.f[11] + c.f[11]; d.f[12] = a.f[12] * b.f[12] + c.f[12]; @@ -4066,22 +4180,22 @@ namespace v16 d.f[15] = a.f[15] * b.f[15] + c.f[15]; return d; - } +} - inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fms( const v16float& a, const v16float& b, const 
v16float& c ) +{ v16float d; - d.f[ 0] = a.f[ 0] * b.f[ 0] - c.f[ 0]; - d.f[ 1] = a.f[ 1] * b.f[ 1] - c.f[ 1]; - d.f[ 2] = a.f[ 2] * b.f[ 2] - c.f[ 2]; - d.f[ 3] = a.f[ 3] * b.f[ 3] - c.f[ 3]; - d.f[ 4] = a.f[ 4] * b.f[ 4] - c.f[ 4]; - d.f[ 5] = a.f[ 5] * b.f[ 5] - c.f[ 5]; - d.f[ 6] = a.f[ 6] * b.f[ 6] - c.f[ 6]; - d.f[ 7] = a.f[ 7] * b.f[ 7] - c.f[ 7]; - d.f[ 8] = a.f[ 8] * b.f[ 8] - c.f[ 8]; - d.f[ 9] = a.f[ 9] * b.f[ 9] - c.f[ 9]; + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + d.f[4] = a.f[4] * b.f[4] - c.f[4]; + d.f[5] = a.f[5] * b.f[5] - c.f[5]; + d.f[6] = a.f[6] * b.f[6] - c.f[6]; + d.f[7] = a.f[7] * b.f[7] - c.f[7]; + d.f[8] = a.f[8] * b.f[8] - c.f[8]; + d.f[9] = a.f[9] * b.f[9] - c.f[9]; d.f[10] = a.f[10] * b.f[10] - c.f[10]; d.f[11] = a.f[11] * b.f[11] - c.f[11]; d.f[12] = a.f[12] * b.f[12] - c.f[12]; @@ -4090,22 +4204,22 @@ namespace v16 d.f[15] = a.f[15] * b.f[15] - c.f[15]; return d; - } +} - inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fnms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; - d.f[ 0] = c.f[ 0] - a.f[ 0] * b.f[ 0]; - d.f[ 1] = c.f[ 1] - a.f[ 1] * b.f[ 1]; - d.f[ 2] = c.f[ 2] - a.f[ 2] * b.f[ 2]; - d.f[ 3] = c.f[ 3] - a.f[ 3] * b.f[ 3]; - d.f[ 4] = c.f[ 4] - a.f[ 4] * b.f[ 4]; - d.f[ 5] = c.f[ 5] - a.f[ 5] * b.f[ 5]; - d.f[ 6] = c.f[ 6] - a.f[ 6] * b.f[ 6]; - d.f[ 7] = c.f[ 7] - a.f[ 7] * b.f[ 7]; - d.f[ 8] = c.f[ 8] - a.f[ 8] * b.f[ 8]; - d.f[ 9] = c.f[ 9] - a.f[ 9] * b.f[ 9]; + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + d.f[4] = c.f[4] - a.f[4] * b.f[4]; + d.f[5] = c.f[5] - a.f[5] * b.f[5]; + d.f[6] = c.f[6] - a.f[6] * b.f[6]; + d.f[7] = c.f[7] - a.f[7] * b.f[7]; + d.f[8] = c.f[8] - a.f[8] * b.f[8]; + d.f[9] = c.f[9] - a.f[9] * b.f[9]; d.f[10] = c.f[10] 
- a.f[10] * b.f[10]; d.f[11] = c.f[11] - a.f[11] * b.f[11]; d.f[12] = c.f[12] - a.f[12] * b.f[12]; @@ -4114,22 +4228,22 @@ namespace v16 d.f[15] = c.f[15] - a.f[15] * b.f[15]; return d; - } +} - inline v16float clear_bits( const v16int &m, const v16float &a ) - { +inline v16float clear_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = ( ~m.i[ 0] ) & a.i[ 0]; - b.i[ 1] = ( ~m.i[ 1] ) & a.i[ 1]; - b.i[ 2] = ( ~m.i[ 2] ) & a.i[ 2]; - b.i[ 3] = ( ~m.i[ 3] ) & a.i[ 3]; - b.i[ 4] = ( ~m.i[ 4] ) & a.i[ 4]; - b.i[ 5] = ( ~m.i[ 5] ) & a.i[ 5]; - b.i[ 6] = ( ~m.i[ 6] ) & a.i[ 6]; - b.i[ 7] = ( ~m.i[ 7] ) & a.i[ 7]; - b.i[ 8] = ( ~m.i[ 8] ) & a.i[ 8]; - b.i[ 9] = ( ~m.i[ 9] ) & a.i[ 9]; + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = ( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + b.i[4] = ( ~m.i[4] ) & a.i[4]; + b.i[5] = ( ~m.i[5] ) & a.i[5]; + b.i[6] = ( ~m.i[6] ) & a.i[6]; + b.i[7] = ( ~m.i[7] ) & a.i[7]; + b.i[8] = ( ~m.i[8] ) & a.i[8]; + b.i[9] = ( ~m.i[9] ) & a.i[9]; b.i[10] = ( ~m.i[10] ) & a.i[10]; b.i[11] = ( ~m.i[11] ) & a.i[11]; b.i[12] = ( ~m.i[12] ) & a.i[12]; @@ -4138,22 +4252,22 @@ namespace v16 b.i[15] = ( ~m.i[15] ) & a.i[15]; return b; - } +} - inline v16float set_bits( const v16int &m, const v16float &a ) - { +inline v16float set_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = m.i[ 0] | a.i[ 0]; - b.i[ 1] = m.i[ 1] | a.i[ 1]; - b.i[ 2] = m.i[ 2] | a.i[ 2]; - b.i[ 3] = m.i[ 3] | a.i[ 3]; - b.i[ 4] = m.i[ 4] | a.i[ 4]; - b.i[ 5] = m.i[ 5] | a.i[ 5]; - b.i[ 6] = m.i[ 6] | a.i[ 6]; - b.i[ 7] = m.i[ 7] | a.i[ 7]; - b.i[ 8] = m.i[ 8] | a.i[ 8]; - b.i[ 9] = m.i[ 9] | a.i[ 9]; + b.i[0] = m.i[0] | a.i[0]; + b.i[1] = m.i[1] | a.i[1]; + b.i[2] = m.i[2] | a.i[2]; + b.i[3] = m.i[3] | a.i[3]; + b.i[4] = m.i[4] | a.i[4]; + b.i[5] = m.i[5] | a.i[5]; + b.i[6] = m.i[6] | a.i[6]; + b.i[7] = m.i[7] | a.i[7]; + b.i[8] = m.i[8] | a.i[8]; + b.i[9] = m.i[9] | a.i[9]; b.i[10] = m.i[10] | a.i[10]; 
b.i[11] = m.i[11] | a.i[11]; b.i[12] = m.i[12] | a.i[12]; @@ -4162,22 +4276,22 @@ namespace v16 b.i[15] = m.i[15] | a.i[15]; return b; - } +} - inline v16float toggle_bits( const v16int &m, const v16float &a ) - { +inline v16float toggle_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = m.i[ 0] ^ a.i[ 0]; - b.i[ 1] = m.i[ 1] ^ a.i[ 1]; - b.i[ 2] = m.i[ 2] ^ a.i[ 2]; - b.i[ 3] = m.i[ 3] ^ a.i[ 3]; - b.i[ 4] = m.i[ 4] ^ a.i[ 4]; - b.i[ 5] = m.i[ 5] ^ a.i[ 5]; - b.i[ 6] = m.i[ 6] ^ a.i[ 6]; - b.i[ 7] = m.i[ 7] ^ a.i[ 7]; - b.i[ 8] = m.i[ 8] ^ a.i[ 8]; - b.i[ 9] = m.i[ 9] ^ a.i[ 9]; + b.i[0] = m.i[0] ^ a.i[0]; + b.i[1] = m.i[1] ^ a.i[1]; + b.i[2] = m.i[2] ^ a.i[2]; + b.i[3] = m.i[3] ^ a.i[3]; + b.i[4] = m.i[4] ^ a.i[4]; + b.i[5] = m.i[5] ^ a.i[5]; + b.i[6] = m.i[6] ^ a.i[6]; + b.i[7] = m.i[7] ^ a.i[7]; + b.i[8] = m.i[8] ^ a.i[8]; + b.i[9] = m.i[9] ^ a.i[9]; b.i[10] = m.i[10] ^ a.i[10]; b.i[11] = m.i[11] ^ a.i[11]; b.i[12] = m.i[12] ^ a.i[12]; @@ -4186,67 +4300,67 @@ namespace v16 b.i[15] = m.i[15] ^ a.i[15]; return b; - } - - inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] += a.f[ 0]; - p[ 1] += a.f[ 1]; - p[ 2] += a.f[ 2]; - p[ 3] += a.f[ 3]; - p[ 4] += a.f[ 4]; - p[ 5] += a.f[ 5]; - p[ 6] += a.f[ 6]; - p[ 7] += a.f[ 7]; - p[ 8] += a.f[ 8]; - p[ 9] += a.f[ 9]; +} + +inline void increment_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] += a.f[0]; + p[1] += a.f[1]; + p[2] += a.f[2]; + p[3] += a.f[3]; + p[4] += a.f[4]; + p[5] += a.f[5]; + p[6] += a.f[6]; + p[7] += a.f[7]; + p[8] += a.f[8]; + p[9] += a.f[9]; p[10] += a.f[10]; p[11] += a.f[11]; p[12] += a.f[12]; p[13] += a.f[13]; p[14] += a.f[14]; p[15] += a.f[15]; - } - - inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] -= a.f[ 0]; - p[ 1] -= a.f[ 1]; - p[ 2] -= a.f[ 2]; - p[ 3] -= a.f[ 3]; - p[ 4] -= a.f[ 4]; - p[ 5] -= a.f[ 5]; - p[ 6] -= a.f[ 6]; - p[ 7] -= a.f[ 7]; - p[ 8] -= a.f[ 8]; - p[ 9] -= a.f[ 9]; +} + +inline 
void decrement_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] -= a.f[0]; + p[1] -= a.f[1]; + p[2] -= a.f[2]; + p[3] -= a.f[3]; + p[4] -= a.f[4]; + p[5] -= a.f[5]; + p[6] -= a.f[6]; + p[7] -= a.f[7]; + p[8] -= a.f[8]; + p[9] -= a.f[9]; p[10] -= a.f[10]; p[11] -= a.f[11]; p[12] -= a.f[12]; p[13] -= a.f[13]; p[14] -= a.f[14]; p[15] -= a.f[15]; - } - - inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] *= a.f[ 0]; - p[ 1] *= a.f[ 1]; - p[ 2] *= a.f[ 2]; - p[ 3] *= a.f[ 3]; - p[ 4] *= a.f[ 4]; - p[ 5] *= a.f[ 5]; - p[ 6] *= a.f[ 6]; - p[ 7] *= a.f[ 7]; - p[ 8] *= a.f[ 8]; - p[ 9] *= a.f[ 9]; +} + +inline void scale_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] *= a.f[0]; + p[1] *= a.f[1]; + p[2] *= a.f[2]; + p[3] *= a.f[3]; + p[4] *= a.f[4]; + p[5] *= a.f[5]; + p[6] *= a.f[6]; + p[7] *= a.f[7]; + p[8] *= a.f[8]; + p[9] *= a.f[9]; p[10] *= a.f[10]; p[11] *= a.f[11]; p[12] *= a.f[12]; p[13] *= a.f[13]; p[14] *= a.f[14]; p[15] *= a.f[15]; - } +} } // namespace v16 diff --git a/src/util/v16/v16_portable_v0.h b/src/util/v16/v16_portable_v0.h index 084d1bb2..f69b8cda 100644 --- a/src/util/v16/v16_portable_v0.h +++ b/src/util/v16/v16_portable_v0.h @@ -11,384 +11,294 @@ #include #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v16 { - class v16; - class v16int; - class v16float; +class v16; +class v16int; +class v16float; - //////////////// - // v16 base class +//////////////// +// v16 base class - class v16 - { +class v16 +{ friend class v16int; friend class v16float; // v16 miscellaneous friends - friend inline int any( const v16 &a ) ALWAYS_INLINE; - friend inline int all( const v16 &a ) ALWAYS_INLINE; + friend inline int any( const v16& a ) ALWAYS_INLINE; + friend inline int all( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + 
template + friend inline v16 splat( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 shuffle( const v16& a ) ALWAYS_INLINE; - friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + friend inline void swap( v16& a, v16& b ) ALWAYS_INLINE; + friend inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, + v16& a04, v16& a05, v16& a06, v16& a07, + v16& a08, v16& a09, v16& a10, v16& a11, + v16& a12, v16& a13, v16& a14, + v16& a15 ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& a, + const v16& b ) ALWAYS_INLINE; // v16 memory manipulation friends - friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; - friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; - friend inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) ALWAYS_INLINE; - friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + friend inline void load_16x1( const void* ALIGNED( 64 ) p, + v16& a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; 
+ friend inline void stream_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void* ALIGNED( 64 ) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void* ALIGNED( 64 ) dst, + const void* ALIGNED( 64 ) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void* ALIGNED( 64 ) a, + void* ALIGNED( 64 ) b ) ALWAYS_INLINE; // v16 transposed memory manipulation friends // Note: Half aligned values are permissible in the 16x2_tr variants. - friend inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) ALWAYS_INLINE; - friend inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; - friend inline void 
load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; - friend inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, 
- const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - - friend inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; - friend inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, - void * ALIGNED(8) a01, - void * ALIGNED(8) a02, - void * ALIGNED(8) a03, - void * ALIGNED(8) a04, - void * ALIGNED(8) a05, - void * ALIGNED(8) a06, - void * ALIGNED(8) a07, - void * ALIGNED(8) a08, - void * ALIGNED(8) a09, - void * ALIGNED(8) a10, - void * ALIGNED(8) a11, - void * ALIGNED(8) a12, - void * ALIGNED(8) a13, - void * ALIGNED(8) a14, - void * ALIGNED(8) a15 ) ALWAYS_INLINE; - friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * 
ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x4_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - 
void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) ALWAYS_INLINE; - friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void + load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) ALWAYS_INLINE; + friend inline void + load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* 
ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) ALWAYS_INLINE; + friend inline void + load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) ALWAYS_INLINE; + friend inline void + load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) ALWAYS_INLINE; + friend inline void + load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, + v16& h ) ALWAYS_INLINE; + 
friend inline void + load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, v16& a, + v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, v16& b00, + v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, v16& b06, v16& b07, + v16& b08, v16& b09, v16& b10, v16& b11, v16& b12, v16& b13, v16& b14, + v16& b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16& a, void* a00, void* a01, + void* a02, void* a03, void* a04, + void* a05, void* a06, void* a07, + void* a08, void* 
a09, void* a10, + void* a11, void* a12, void* a13, + void* a14, void* a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( + const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, void* ALIGNED( 8 ) a03, + void* ALIGNED( 8 ) a04, void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, void* ALIGNED( 8 ) a09, + void* ALIGNED( 8 ) a10, void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x4_tr( const v16& a, const v16& b, const v16& c, const v16& d, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, 
void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr_p( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 
) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; protected: - - union - { - int i[16]; - float f[16]; + union { + int i[16]; + float f[16]; }; public: + v16() {} // Default constructor - v16() {} // Default constructor - - v16( const v16 &a ) // Copy constructor + v16( const v16& a ) // Copy constructor { - i[ 0]=a.i[ 0]; i[ 1]=a.i[ 1]; i[ 2]=a.i[ 2]; i[ 3]=a.i[ 3]; - i[ 4]=a.i[ 4]; i[ 5]=a.i[ 5]; i[ 6]=a.i[ 6]; i[ 7]=a.i[ 7]; - i[ 8]=a.i[ 8]; i[ 9]=a.i[ 9]; i[10]=a.i[10]; i[11]=a.i[11]; - i[12]=a.i[12]; i[13]=a.i[13]; i[14]=a.i[14]; i[15]=a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - ~v16() {} // Default destructor - }; - - // v16 miscellaneous functions - - inline int any( const v16 &a ) - { - return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || - a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || - a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || - a.i[12] || a.i[13] || a.i[14] || a.i[15]; - } - - inline int all( const v16 &a ) - { - return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && - a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && - a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && - a.i[12] && a.i[13] && a.i[14] && a.i[15]; - } - - template - inline v16 splat( const v16 & a ) - { + ~v16() {} // Default destructor +}; + +// v16 miscellaneous functions + +inline int any( const v16& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7] || a.i[8] || a.i[9] || a.i[10] || a.i[11] || a.i[12] || + a.i[13] || a.i[14] || a.i[15]; +} + +inline int all( const v16& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7] && a.i[8] && a.i[9] && a.i[10] && a.i[11] && a.i[12] && + a.i[13] && a.i[14] && a.i[15]; +} + +template +inline v16 splat( const v16& a ) +{ v16 b; - b.i[ 
0] = a.i[n]; - b.i[ 1] = a.i[n]; - b.i[ 2] = a.i[n]; - b.i[ 3] = a.i[n]; - b.i[ 4] = a.i[n]; - b.i[ 5] = a.i[n]; - b.i[ 6] = a.i[n]; - b.i[ 7] = a.i[n]; - b.i[ 8] = a.i[n]; - b.i[ 9] = a.i[n]; + b.i[0] = a.i[n]; + b.i[1] = a.i[n]; + b.i[2] = a.i[n]; + b.i[3] = a.i[n]; + b.i[4] = a.i[n]; + b.i[5] = a.i[n]; + b.i[6] = a.i[n]; + b.i[7] = a.i[n]; + b.i[8] = a.i[n]; + b.i[9] = a.i[n]; b.i[10] = a.i[n]; b.i[11] = a.i[n]; b.i[12] = a.i[n]; @@ -397,23 +307,25 @@ namespace v16 b.i[15] = a.i[n]; return b; - } +} - template - inline v16 shuffle( const v16 & a ) - { +template +inline v16 shuffle( const v16& a ) +{ v16 b; - b.i[ 0] = a.i[i00]; - b.i[ 1] = a.i[i01]; - b.i[ 2] = a.i[i02]; - b.i[ 3] = a.i[i03]; - b.i[ 4] = a.i[i04]; - b.i[ 5] = a.i[i05]; - b.i[ 6] = a.i[i06]; - b.i[ 7] = a.i[i07]; - b.i[ 8] = a.i[i08]; - b.i[ 9] = a.i[i09]; + b.i[0] = a.i[i00]; + b.i[1] = a.i[i01]; + b.i[2] = a.i[i02]; + b.i[3] = a.i[i03]; + b.i[4] = a.i[i04]; + b.i[5] = a.i[i05]; + b.i[6] = a.i[i06]; + b.i[7] = a.i[i07]; + b.i[8] = a.i[i08]; + b.i[9] = a.i[i09]; b.i[10] = a.i[i10]; b.i[11] = a.i[i11]; b.i[12] = a.i[i12]; @@ -422,2939 +334,3046 @@ namespace v16 b.i[15] = a.i[i15]; return b; - } - -# define sw(x,y) x^=y, y^=x, x^=y - - inline void swap( v16 &a, v16 &b ) - { - sw( a.i[ 0], b.i[ 0] ); - sw( a.i[ 1], b.i[ 1] ); - sw( a.i[ 2], b.i[ 2] ); - sw( a.i[ 3], b.i[ 3] ); - sw( a.i[ 4], b.i[ 4] ); - sw( a.i[ 5], b.i[ 5] ); - sw( a.i[ 6], b.i[ 6] ); - sw( a.i[ 7], b.i[ 7] ); - sw( a.i[ 8], b.i[ 8] ); - sw( a.i[ 9], b.i[ 9] ); +} + +#define sw( x, y ) x ^= y, y ^= x, x ^= y + +inline void swap( v16& a, v16& b ) +{ + sw( a.i[0], b.i[0] ); + sw( a.i[1], b.i[1] ); + sw( a.i[2], b.i[2] ); + sw( a.i[3], b.i[3] ); + sw( a.i[4], b.i[4] ); + sw( a.i[5], b.i[5] ); + sw( a.i[6], b.i[6] ); + sw( a.i[7], b.i[7] ); + sw( a.i[8], b.i[8] ); + sw( a.i[9], b.i[9] ); sw( a.i[10], b.i[10] ); sw( a.i[11], b.i[11] ); sw( a.i[12], b.i[12] ); sw( a.i[13], b.i[13] ); sw( a.i[14], b.i[14] ); sw( a.i[15], b.i[15] ); - } - 
- inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) - { - sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0] ); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); - sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); - sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); - sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); - sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); - sw( a05.i[6],a06.i[5] ); sw( a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( 
a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); - sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); - sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); - sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); - sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); - sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); - sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); - sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); - sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); - sw( a14.i[15],a15.i[14] ); - } - -# undef sw - - // v16 memory manipulation functions - - inline void load_16x1( const void * ALIGNED(64) p, - v16 &a ) - { - a.i[ 0] = ((const int * ALIGNED(64))p)[ 0]; - a.i[ 1] = ((const int * ALIGNED(64))p)[ 1]; - a.i[ 2] = ((const int * ALIGNED(64))p)[ 2]; - a.i[ 3] = ((const int * ALIGNED(64))p)[ 3]; - a.i[ 4] = ((const int * ALIGNED(64))p)[ 4]; - a.i[ 5] = ((const int * ALIGNED(64))p)[ 5]; - a.i[ 6] = ((const int * ALIGNED(64))p)[ 6]; - a.i[ 7] = ((const int * ALIGNED(64))p)[ 7]; - a.i[ 8] = ((const int * ALIGNED(64))p)[ 8]; - a.i[ 9] = ((const int * ALIGNED(64))p)[ 9]; - a.i[10] = ((const int * 
ALIGNED(64))p)[10]; - a.i[11] = ((const int * ALIGNED(64))p)[11]; - a.i[12] = ((const int * ALIGNED(64))p)[12]; - a.i[13] = ((const int * ALIGNED(64))p)[13]; - a.i[14] = ((const int * ALIGNED(64))p)[14]; - a.i[15] = ((const int * ALIGNED(64))p)[15]; - } - - inline void store_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; - ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; - ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; - ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; - ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; - ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; - ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; - ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; - ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; - ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; - ((int * ALIGNED(64))p)[10] = a.i[10]; - ((int * ALIGNED(64))p)[11] = a.i[11]; - ((int * ALIGNED(64))p)[12] = a.i[12]; - ((int * ALIGNED(64))p)[13] = a.i[13]; - ((int * ALIGNED(64))p)[14] = a.i[14]; - ((int * ALIGNED(64))p)[15] = a.i[15]; - } - - inline void stream_16x1( const v16 &a, - void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; - ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; - ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; - ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; - ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; - ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; - ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; - ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; - ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; - ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; - ((int * ALIGNED(64))p)[10] = a.i[10]; - ((int * ALIGNED(64))p)[11] = a.i[11]; - ((int * ALIGNED(64))p)[12] = a.i[12]; - ((int * ALIGNED(64))p)[13] = a.i[13]; - ((int * ALIGNED(64))p)[14] = a.i[14]; - ((int * ALIGNED(64))p)[15] = a.i[15]; - } - - inline void clear_16x1( void * ALIGNED(64) p ) - { - ((int * ALIGNED(64))p)[ 0] = 0; - ((int * ALIGNED(64))p)[ 1] = 0; - ((int * ALIGNED(64))p)[ 2] = 0; - ((int * ALIGNED(64))p)[ 3] = 0; - ((int * ALIGNED(64))p)[ 4] = 0; - ((int * ALIGNED(64))p)[ 5] = 0; - ((int * ALIGNED(64))p)[ 6] = 0; - ((int * 
ALIGNED(64))p)[ 7] = 0; - ((int * ALIGNED(64))p)[ 8] = 0; - ((int * ALIGNED(64))p)[ 9] = 0; - ((int * ALIGNED(64))p)[10] = 0; - ((int * ALIGNED(64))p)[11] = 0; - ((int * ALIGNED(64))p)[12] = 0; - ((int * ALIGNED(64))p)[13] = 0; - ((int * ALIGNED(64))p)[14] = 0; - ((int * ALIGNED(64))p)[15] = 0; - } - - // FIXME: Ordering semantics - inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) - { - ((int * ALIGNED(64))dst)[ 0] = ((const int * ALIGNED(64))src)[ 0]; - ((int * ALIGNED(64))dst)[ 1] = ((const int * ALIGNED(64))src)[ 1]; - ((int * ALIGNED(64))dst)[ 2] = ((const int * ALIGNED(64))src)[ 2]; - ((int * ALIGNED(64))dst)[ 3] = ((const int * ALIGNED(64))src)[ 3]; - ((int * ALIGNED(64))dst)[ 4] = ((const int * ALIGNED(64))src)[ 4]; - ((int * ALIGNED(64))dst)[ 5] = ((const int * ALIGNED(64))src)[ 5]; - ((int * ALIGNED(64))dst)[ 6] = ((const int * ALIGNED(64))src)[ 6]; - ((int * ALIGNED(64))dst)[ 7] = ((const int * ALIGNED(64))src)[ 7]; - ((int * ALIGNED(64))dst)[ 8] = ((const int * ALIGNED(64))src)[ 8]; - ((int * ALIGNED(64))dst)[ 9] = ((const int * ALIGNED(64))src)[ 9]; - ((int * ALIGNED(64))dst)[10] = ((const int * ALIGNED(64))src)[10]; - ((int * ALIGNED(64))dst)[11] = ((const int * ALIGNED(64))src)[11]; - ((int * ALIGNED(64))dst)[12] = ((const int * ALIGNED(64))src)[12]; - ((int * ALIGNED(64))dst)[13] = ((const int * ALIGNED(64))src)[13]; - ((int * ALIGNED(64))dst)[14] = ((const int * ALIGNED(64))src)[14]; - ((int * ALIGNED(64))dst)[15] = ((const int * ALIGNED(64))src)[15]; - } - - inline void swap_16x1( void * ALIGNED(64) a, - void * ALIGNED(64) b ) - { +} + +inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, v16& a04, + v16& a05, v16& a06, v16& a07, v16& a08, v16& a09, + v16& a10, v16& a11, v16& a12, v16& a13, v16& a14, + v16& a15 ) +{ + sw( a00.i[1], a01.i[0] ); + sw( a00.i[2], a02.i[0] ); + sw( a00.i[3], a03.i[0] ); + sw( a00.i[4], a04.i[0] ); + sw( a00.i[5], a05.i[0] ); + sw( a00.i[6], a06.i[0] ); + sw( a00.i[7], a07.i[0] ); 
+ sw( a00.i[8], a08.i[0] ); + sw( a00.i[9], a09.i[0] ); + sw( a00.i[10], a10.i[0] ); + sw( a00.i[11], a11.i[0] ); + sw( a00.i[12], a12.i[0] ); + sw( a00.i[13], a13.i[0] ); + sw( a00.i[14], a14.i[0] ); + sw( a00.i[15], a15.i[0] ); + sw( a01.i[2], a02.i[1] ); + sw( a01.i[3], a03.i[1] ); + sw( a01.i[4], a04.i[1] ); + sw( a01.i[5], a05.i[1] ); + sw( a01.i[6], a06.i[1] ); + sw( a01.i[7], a07.i[1] ); + sw( a01.i[8], a08.i[1] ); + sw( a01.i[9], a09.i[1] ); + sw( a01.i[10], a10.i[1] ); + sw( a01.i[11], a11.i[1] ); + sw( a01.i[12], a12.i[1] ); + sw( a01.i[13], a13.i[1] ); + sw( a01.i[14], a14.i[1] ); + sw( a01.i[15], a15.i[1] ); + sw( a02.i[3], a03.i[2] ); + sw( a02.i[4], a04.i[2] ); + sw( a02.i[5], a05.i[2] ); + sw( a02.i[6], a06.i[2] ); + sw( a02.i[7], a07.i[2] ); + sw( a02.i[8], a08.i[2] ); + sw( a02.i[9], a09.i[2] ); + sw( a02.i[10], a10.i[2] ); + sw( a02.i[11], a11.i[2] ); + sw( a02.i[12], a12.i[2] ); + sw( a02.i[13], a13.i[2] ); + sw( a02.i[14], a14.i[2] ); + sw( a02.i[15], a15.i[2] ); + sw( a03.i[4], a04.i[3] ); + sw( a03.i[5], a05.i[3] ); + sw( a03.i[6], a06.i[3] ); + sw( a03.i[7], a07.i[3] ); + sw( a03.i[8], a08.i[3] ); + sw( a03.i[9], a09.i[3] ); + sw( a03.i[10], a10.i[3] ); + sw( a03.i[11], a11.i[3] ); + sw( a03.i[12], a12.i[3] ); + sw( a03.i[13], a13.i[3] ); + sw( a03.i[14], a14.i[3] ); + sw( a03.i[15], a15.i[3] ); + sw( a04.i[5], a05.i[4] ); + sw( a04.i[6], a06.i[4] ); + sw( a04.i[7], a07.i[4] ); + sw( a04.i[8], a08.i[4] ); + sw( a04.i[9], a09.i[4] ); + sw( a04.i[10], a10.i[4] ); + sw( a04.i[11], a11.i[4] ); + sw( a04.i[12], a12.i[4] ); + sw( a04.i[13], a13.i[4] ); + sw( a04.i[14], a14.i[4] ); + sw( a04.i[15], a15.i[4] ); + sw( a05.i[6], a06.i[5] ); + sw( a05.i[7], a07.i[5] ); + sw( a05.i[8], a08.i[5] ); + sw( a05.i[9], a09.i[5] ); + sw( a05.i[10], a10.i[5] ); + sw( a05.i[11], a11.i[5] ); + sw( a05.i[12], a12.i[5] ); + sw( a05.i[13], a13.i[5] ); + sw( a05.i[14], a14.i[5] ); + sw( a05.i[15], a15.i[5] ); + sw( a06.i[7], a07.i[6] ); + sw( a06.i[8], a08.i[6] ); + 
sw( a06.i[9], a09.i[6] ); + sw( a06.i[10], a10.i[6] ); + sw( a06.i[11], a11.i[6] ); + sw( a06.i[12], a12.i[6] ); + sw( a06.i[13], a13.i[6] ); + sw( a06.i[14], a14.i[6] ); + sw( a06.i[15], a15.i[6] ); + sw( a07.i[8], a08.i[7] ); + sw( a07.i[9], a09.i[7] ); + sw( a07.i[10], a10.i[7] ); + sw( a07.i[11], a11.i[7] ); + sw( a07.i[12], a12.i[7] ); + sw( a07.i[13], a13.i[7] ); + sw( a07.i[14], a14.i[7] ); + sw( a07.i[15], a15.i[7] ); + sw( a08.i[9], a09.i[8] ); + sw( a08.i[10], a10.i[8] ); + sw( a08.i[11], a11.i[8] ); + sw( a08.i[12], a12.i[8] ); + sw( a08.i[13], a13.i[8] ); + sw( a08.i[14], a14.i[8] ); + sw( a08.i[15], a15.i[8] ); + sw( a09.i[10], a10.i[9] ); + sw( a09.i[11], a11.i[9] ); + sw( a09.i[12], a12.i[9] ); + sw( a09.i[13], a13.i[9] ); + sw( a09.i[14], a14.i[9] ); + sw( a09.i[15], a15.i[9] ); + sw( a10.i[11], a11.i[10] ); + sw( a10.i[12], a12.i[10] ); + sw( a10.i[13], a13.i[10] ); + sw( a10.i[14], a14.i[10] ); + sw( a10.i[15], a15.i[10] ); + sw( a11.i[12], a12.i[11] ); + sw( a11.i[13], a13.i[11] ); + sw( a11.i[14], a14.i[11] ); + sw( a11.i[15], a15.i[11] ); + sw( a12.i[13], a13.i[12] ); + sw( a12.i[14], a14.i[12] ); + sw( a12.i[15], a15.i[12] ); + sw( a13.i[14], a14.i[13] ); + sw( a13.i[15], a15.i[13] ); + sw( a14.i[15], a15.i[14] ); +} + +#undef sw + +// v16 memory manipulation functions + +inline void load_16x1( const void* ALIGNED( 64 ) p, v16& a ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 64 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 64 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 64 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 64 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 64 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 64 ))p )[6]; + a.i[7] = ( (const int* ALIGNED( 64 ))p )[7]; + a.i[8] = ( (const int* ALIGNED( 64 ))p )[8]; + a.i[9] = ( (const int* ALIGNED( 64 ))p )[9]; + a.i[10] = ( (const int* ALIGNED( 64 ))p )[10]; + a.i[11] = ( (const int* ALIGNED( 64 ))p )[11]; + a.i[12] = ( (const int* ALIGNED( 64 ))p )[12]; + 
a.i[13] = ( (const int* ALIGNED( 64 ))p )[13]; + a.i[14] = ( (const int* ALIGNED( 64 ))p )[14]; + a.i[15] = ( (const int* ALIGNED( 64 ))p )[15]; +} + +inline void store_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 64 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 64 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 64 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 64 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 64 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 64 ))p )[7] = a.i[7]; + ( (int* ALIGNED( 64 ))p )[8] = a.i[8]; + ( (int* ALIGNED( 64 ))p )[9] = a.i[9]; + ( (int* ALIGNED( 64 ))p )[10] = a.i[10]; + ( (int* ALIGNED( 64 ))p )[11] = a.i[11]; + ( (int* ALIGNED( 64 ))p )[12] = a.i[12]; + ( (int* ALIGNED( 64 ))p )[13] = a.i[13]; + ( (int* ALIGNED( 64 ))p )[14] = a.i[14]; + ( (int* ALIGNED( 64 ))p )[15] = a.i[15]; +} + +inline void stream_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 64 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 64 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 64 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 64 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 64 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 64 ))p )[7] = a.i[7]; + ( (int* ALIGNED( 64 ))p )[8] = a.i[8]; + ( (int* ALIGNED( 64 ))p )[9] = a.i[9]; + ( (int* ALIGNED( 64 ))p )[10] = a.i[10]; + ( (int* ALIGNED( 64 ))p )[11] = a.i[11]; + ( (int* ALIGNED( 64 ))p )[12] = a.i[12]; + ( (int* ALIGNED( 64 ))p )[13] = a.i[13]; + ( (int* ALIGNED( 64 ))p )[14] = a.i[14]; + ( (int* ALIGNED( 64 ))p )[15] = a.i[15]; +} + +inline void clear_16x1( void* ALIGNED( 64 ) p ) +{ + ( (int* ALIGNED( 64 ))p )[0] = 0; + ( (int* ALIGNED( 64 ))p )[1] = 0; + ( (int* ALIGNED( 64 ))p )[2] = 0; + ( (int* ALIGNED( 64 ))p )[3] = 0; + ( (int* ALIGNED( 64 ))p )[4] = 0; + ( (int* ALIGNED( 64 ))p )[5] = 0; + ( (int* ALIGNED( 64 ))p )[6] = 0; + ( (int* ALIGNED( 64 ))p )[7] = 0; + ( (int* ALIGNED( 64 ))p )[8] 
= 0; + ( (int* ALIGNED( 64 ))p )[9] = 0; + ( (int* ALIGNED( 64 ))p )[10] = 0; + ( (int* ALIGNED( 64 ))p )[11] = 0; + ( (int* ALIGNED( 64 ))p )[12] = 0; + ( (int* ALIGNED( 64 ))p )[13] = 0; + ( (int* ALIGNED( 64 ))p )[14] = 0; + ( (int* ALIGNED( 64 ))p )[15] = 0; +} + +// FIXME: Ordering semantics +inline void copy_16x1( void* ALIGNED( 64 ) dst, const void* ALIGNED( 64 ) src ) +{ + ( (int* ALIGNED( 64 ))dst )[0] = ( (const int* ALIGNED( 64 ))src )[0]; + ( (int* ALIGNED( 64 ))dst )[1] = ( (const int* ALIGNED( 64 ))src )[1]; + ( (int* ALIGNED( 64 ))dst )[2] = ( (const int* ALIGNED( 64 ))src )[2]; + ( (int* ALIGNED( 64 ))dst )[3] = ( (const int* ALIGNED( 64 ))src )[3]; + ( (int* ALIGNED( 64 ))dst )[4] = ( (const int* ALIGNED( 64 ))src )[4]; + ( (int* ALIGNED( 64 ))dst )[5] = ( (const int* ALIGNED( 64 ))src )[5]; + ( (int* ALIGNED( 64 ))dst )[6] = ( (const int* ALIGNED( 64 ))src )[6]; + ( (int* ALIGNED( 64 ))dst )[7] = ( (const int* ALIGNED( 64 ))src )[7]; + ( (int* ALIGNED( 64 ))dst )[8] = ( (const int* ALIGNED( 64 ))src )[8]; + ( (int* ALIGNED( 64 ))dst )[9] = ( (const int* ALIGNED( 64 ))src )[9]; + ( (int* ALIGNED( 64 ))dst )[10] = ( (const int* ALIGNED( 64 ))src )[10]; + ( (int* ALIGNED( 64 ))dst )[11] = ( (const int* ALIGNED( 64 ))src )[11]; + ( (int* ALIGNED( 64 ))dst )[12] = ( (const int* ALIGNED( 64 ))src )[12]; + ( (int* ALIGNED( 64 ))dst )[13] = ( (const int* ALIGNED( 64 ))src )[13]; + ( (int* ALIGNED( 64 ))dst )[14] = ( (const int* ALIGNED( 64 ))src )[14]; + ( (int* ALIGNED( 64 ))dst )[15] = ( (const int* ALIGNED( 64 ))src )[15]; +} + +inline void swap_16x1( void* ALIGNED( 64 ) a, void* ALIGNED( 64 ) b ) +{ int t; - t = ((int * ALIGNED(64))a)[ 0]; - ((int * ALIGNED(64))a)[ 0] = ((int * ALIGNED(64))b)[ 0]; - ((int * ALIGNED(64))b)[ 0] = t; - - t = ((int * ALIGNED(64))a)[ 1]; - ((int * ALIGNED(64))a)[ 1] = ((int * ALIGNED(64))b)[ 1]; - ((int * ALIGNED(64))b)[ 1] = t; - - t = ((int * ALIGNED(64))a)[ 2]; - ((int * ALIGNED(64))a)[ 2] = ((int * ALIGNED(64))b)[ 2]; 
- ((int * ALIGNED(64))b)[ 2] = t; - - t = ((int * ALIGNED(64))a)[ 3]; - ((int * ALIGNED(64))a)[ 3] = ((int * ALIGNED(64))b)[ 3]; - ((int * ALIGNED(64))b)[ 3] = t; - - t = ((int * ALIGNED(64))a)[ 4]; - ((int * ALIGNED(64))a)[ 4] = ((int * ALIGNED(64))b)[ 4]; - ((int * ALIGNED(64))b)[ 4] = t; - - t = ((int * ALIGNED(64))a)[ 5]; - ((int * ALIGNED(64))a)[ 5] = ((int * ALIGNED(64))b)[ 5]; - ((int * ALIGNED(64))b)[ 5] = t; - - t = ((int * ALIGNED(64))a)[ 6]; - ((int * ALIGNED(64))a)[ 6] = ((int * ALIGNED(64))b)[ 6]; - ((int * ALIGNED(64))b)[ 6] = t; - - t = ((int * ALIGNED(64))a)[ 7]; - ((int * ALIGNED(64))a)[ 7] = ((int * ALIGNED(64))b)[ 7]; - ((int * ALIGNED(64))b)[ 7] = t; - - t = ((int * ALIGNED(64))a)[ 8]; - ((int * ALIGNED(64))a)[ 8] = ((int * ALIGNED(64))b)[ 8]; - ((int * ALIGNED(64))b)[ 8] = t; - - t = ((int * ALIGNED(64))a)[ 9]; - ((int * ALIGNED(64))a)[ 9] = ((int * ALIGNED(64))b)[ 9]; - ((int * ALIGNED(64))b)[ 9] = t; - - t = ((int * ALIGNED(64))a)[10]; - ((int * ALIGNED(64))a)[10] = ((int * ALIGNED(64))b)[10]; - ((int * ALIGNED(64))b)[10] = t; - - t = ((int * ALIGNED(64))a)[11]; - ((int * ALIGNED(64))a)[11] = ((int * ALIGNED(64))b)[11]; - ((int * ALIGNED(64))b)[11] = t; - - t = ((int * ALIGNED(64))a)[12]; - ((int * ALIGNED(64))a)[12] = ((int * ALIGNED(64))b)[12]; - ((int * ALIGNED(64))b)[12] = t; - - t = ((int * ALIGNED(64))a)[13]; - ((int * ALIGNED(64))a)[13] = ((int * ALIGNED(64))b)[13]; - ((int * ALIGNED(64))b)[13] = t; - - t = ((int * ALIGNED(64))a)[14]; - ((int * ALIGNED(64))a)[14] = ((int * ALIGNED(64))b)[14]; - ((int * ALIGNED(64))b)[14] = t; - - t = ((int * ALIGNED(64))a)[15]; - ((int * ALIGNED(64))a)[15] = ((int * ALIGNED(64))b)[15]; - ((int * ALIGNED(64))b)[15] = t; - } - - // v16 transposed memory manipulation functions - - inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const 
void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) - { - a.i[ 0] = ((const int *)a00)[0]; - a.i[ 1] = ((const int *)a01)[0]; - a.i[ 2] = ((const int *)a02)[0]; - a.i[ 3] = ((const int *)a03)[0]; - a.i[ 4] = ((const int *)a04)[0]; - a.i[ 5] = ((const int *)a05)[0]; - a.i[ 6] = ((const int *)a06)[0]; - a.i[ 7] = ((const int *)a07)[0]; - a.i[ 8] = ((const int *)a08)[0]; - a.i[ 9] = ((const int *)a09)[0]; - a.i[10] = ((const int *)a10)[0]; - a.i[11] = ((const int *)a11)[0]; - a.i[12] = ((const int *)a12)[0]; - a.i[13] = ((const int *)a13)[0]; - a.i[14] = ((const int *)a14)[0]; - a.i[15] = ((const int *)a15)[0]; - } - - inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) - { - a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; - - a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; - - a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; - - a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; - - a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; - - a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; - - a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; - - a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; - - a.i[ 8] = ((const int * 
ALIGNED(8))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; - - a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; - - a.i[10] = ((const int * ALIGNED(8))a10)[0]; - b.i[10] = ((const int * ALIGNED(8))a10)[1]; - - a.i[11] = ((const int * ALIGNED(8))a11)[0]; - b.i[11] = ((const int * ALIGNED(8))a11)[1]; - - a.i[12] = ((const int * ALIGNED(8))a12)[0]; - b.i[12] = ((const int * ALIGNED(8))a12)[1]; - - a.i[13] = ((const int * ALIGNED(8))a13)[0]; - b.i[13] = ((const int * ALIGNED(8))a13)[1]; - - a.i[14] = ((const int * ALIGNED(8))a14)[0]; - b.i[14] = ((const int * ALIGNED(8))a14)[1]; - - a.i[15] = ((const int * ALIGNED(8))a15)[0]; - b.i[15] = ((const int * ALIGNED(8))a15)[1]; - } - - inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * 
ALIGNED(64))a04)[2]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - } - - inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * 
ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = 
((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - } - - inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const 
int * ALIGNED(64))a00)[3]; - e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; - f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; - g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; - h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; - f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; - g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; - h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; - f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; - g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; - h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; - f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; - g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; - h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; - f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; - g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; - h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; - e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; - f.i[ 5] = ((const int * 
ALIGNED(64))a05)[5]; - g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; - h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; - f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; - g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; - h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; - f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; - g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; - h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; - f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; - g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; - h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; - e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; - f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; - g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; - h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - e.i[10] = ((const int * ALIGNED(64))a10)[4]; - f.i[10] = ((const int * ALIGNED(64))a10)[5]; - g.i[10] = ((const int * ALIGNED(64))a10)[6]; - h.i[10] = ((const int * 
ALIGNED(64))a10)[7]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - e.i[11] = ((const int * ALIGNED(64))a11)[4]; - f.i[11] = ((const int * ALIGNED(64))a11)[5]; - g.i[11] = ((const int * ALIGNED(64))a11)[6]; - h.i[11] = ((const int * ALIGNED(64))a11)[7]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - e.i[12] = ((const int * ALIGNED(64))a12)[4]; - f.i[12] = ((const int * ALIGNED(64))a12)[5]; - g.i[12] = ((const int * ALIGNED(64))a12)[6]; - h.i[12] = ((const int * ALIGNED(64))a12)[7]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - e.i[13] = ((const int * ALIGNED(64))a13)[4]; - f.i[13] = ((const int * ALIGNED(64))a13)[5]; - g.i[13] = ((const int * ALIGNED(64))a13)[6]; - h.i[13] = ((const int * ALIGNED(64))a13)[7]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - e.i[14] = ((const int * ALIGNED(64))a14)[4]; - f.i[14] = ((const int * ALIGNED(64))a14)[5]; - g.i[14] = ((const int * ALIGNED(64))a14)[6]; - h.i[14] = ((const int * ALIGNED(64))a14)[7]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - e.i[15] = ((const int * ALIGNED(64))a15)[4]; - f.i[15] = ((const int * ALIGNED(64))a15)[5]; - g.i[15] = ((const int * ALIGNED(64))a15)[6]; - h.i[15] = ((const int * ALIGNED(64))a15)[7]; - } - - inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * 
ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; - b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; - b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; - b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; - b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; - b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; - b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; - b15.i[ 0] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; - b09.i[ 1] = ((const int * ALIGNED(64))a01)[ 9]; - b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; - b11.i[ 
1] = ((const int * ALIGNED(64))a01)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a01)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; - b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; - b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; - b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; - b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; - b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; - b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; - b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; - b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 3] = ((const int * ALIGNED(64))a03)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; - 
b03.i[ 4] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; - b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; - b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; - b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; - b11.i[ 4] = ((const int * ALIGNED(64))a04)[11]; - b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; - b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; - b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; - b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; - b15.i[ 5] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; - b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 8]; - b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; - b10.i[ 6] = ((const int * 
ALIGNED(64))a06)[10]; - b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; - b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; - b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; - b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; - b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 8] = ((const int * ALIGNED(64))a08)[10]; - b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; - - b00.i[ 9] = ((const int * ALIGNED(64))a09)[ 0]; - b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; - b02.i[ 9] = ((const 
int * ALIGNED(64))a09)[ 2]; - b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; - b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; - b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; - b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; - b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[10] = ((const int * ALIGNED(64))a10)[10]; - b11.i[10] = ((const int * ALIGNED(64))a10)[11]; - b12.i[10] = ((const int * ALIGNED(64))a10)[12]; - b13.i[10] = ((const int * ALIGNED(64))a10)[13]; - b14.i[10] = ((const int * ALIGNED(64))a10)[14]; - b15.i[10] = ((const int * ALIGNED(64))a10)[15]; - - b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; - b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; - b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; - b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; - b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; - b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; - b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; - b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[11] = ((const int * ALIGNED(64))a11)[ 8]; - b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[11] = 
((const int * ALIGNED(64))a11)[10]; - b11.i[11] = ((const int * ALIGNED(64))a11)[11]; - b12.i[11] = ((const int * ALIGNED(64))a11)[12]; - b13.i[11] = ((const int * ALIGNED(64))a11)[13]; - b14.i[11] = ((const int * ALIGNED(64))a11)[14]; - b15.i[11] = ((const int * ALIGNED(64))a11)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[12] = ((const int * ALIGNED(64))a12)[10]; - b11.i[12] = ((const int * ALIGNED(64))a12)[11]; - b12.i[12] = ((const int * ALIGNED(64))a12)[12]; - b13.i[12] = ((const int * ALIGNED(64))a12)[13]; - b14.i[12] = ((const int * ALIGNED(64))a12)[14]; - b15.i[12] = ((const int * ALIGNED(64))a12)[15]; - - b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; - b01.i[13] = ((const int * ALIGNED(64))a13)[ 1]; - b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; - b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; - b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; - b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; - b06.i[13] = ((const int * ALIGNED(64))a13)[ 6]; - b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a13)[10]; - b11.i[13] = ((const int * ALIGNED(64))a13)[11]; - b12.i[13] = ((const int * ALIGNED(64))a13)[12]; - b13.i[13] = ((const int * ALIGNED(64))a13)[13]; - b14.i[13] = ((const int * ALIGNED(64))a13)[14]; - b15.i[13] = ((const int * ALIGNED(64))a13)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a14)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; - 
b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[14] = ((const int * ALIGNED(64))a14)[10]; - b11.i[14] = ((const int * ALIGNED(64))a14)[11]; - b12.i[14] = ((const int * ALIGNED(64))a14)[12]; - b13.i[14] = ((const int * ALIGNED(64))a14)[13]; - b14.i[14] = ((const int * ALIGNED(64))a14)[14]; - b15.i[14] = ((const int * ALIGNED(64))a14)[15]; - - b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; - b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; - b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; - b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; - b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; - b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; - b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; - b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = ((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = 
((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 
5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - 
b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = ((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 
9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - } - - inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const 
int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = 
((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - 
b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = ((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - - b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; - b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; - b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; - b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; - b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; - b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; - b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; - b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 1] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 1] = ((const int * 
ALIGNED(64))a08)[10]; - b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; - - b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; - b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; - b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; - b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; - b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; - b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; - b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; - b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; - - b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; - b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; - b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; - b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; - b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; - b13.i[ 4] = ((const int * ALIGNED(64))a10)[ 5]; - b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 6]; - b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; - b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; - - b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; - b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; - b10.i[ 6] = ((const 
int * ALIGNED(64))a11)[ 2]; - b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; - b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; - b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; - b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; - b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a11)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; - - b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; - b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; - b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; - b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; - b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; - b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; - b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; - b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; - b14.i[ 9] = ((const int * ALIGNED(64))a12)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; - - b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; - b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; - b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; - b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; - b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; - b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; - b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; - b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[11] = ((const int * ALIGNED(64))a13)[ 8]; - b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[11] = 
((const int * ALIGNED(64))a13)[10]; - b11.i[11] = ((const int * ALIGNED(64))a13)[11]; - b12.i[11] = ((const int * ALIGNED(64))a13)[12]; - b13.i[11] = ((const int * ALIGNED(64))a13)[13]; - b14.i[11] = ((const int * ALIGNED(64))a13)[14]; - b15.i[11] = ((const int * ALIGNED(64))a13)[15]; - - b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; - b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; - b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; - b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; - b12.i[12] = ((const int * ALIGNED(64))a14)[ 4]; - b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; - b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; - b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a14)[10]; - b11.i[13] = ((const int * ALIGNED(64))a14)[11]; - b12.i[13] = ((const int * ALIGNED(64))a14)[12]; - b13.i[13] = ((const int * ALIGNED(64))a14)[13]; - b14.i[13] = ((const int * ALIGNED(64))a14)[14]; - b15.i[13] = ((const int * ALIGNED(64))a14)[15]; - - b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; - b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; - b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; - b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; - b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; - b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; - b14.i[14] = ((const int * ALIGNED(64))a15)[ 6]; - b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = ((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, 
void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) - { - ((int *)a00)[0] = a.i[ 0]; - ((int *)a01)[0] = a.i[ 1]; - ((int *)a02)[0] = a.i[ 2]; - ((int *)a03)[0] = a.i[ 3]; - ((int *)a04)[0] = a.i[ 4]; - ((int *)a05)[0] = a.i[ 5]; - ((int *)a06)[0] = a.i[ 6]; - ((int *)a07)[0] = a.i[ 7]; - ((int *)a08)[0] = a.i[ 8]; - ((int *)a09)[0] = a.i[ 9]; - ((int *)a10)[0] = a.i[10]; - ((int *)a11)[0] = a.i[11]; - ((int *)a12)[0] = a.i[12]; - ((int *)a13)[0] = a.i[13]; - ((int *)a14)[0] = a.i[14]; - ((int *)a15)[0] = a.i[15]; - } - - inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, void * ALIGNED(8) a01, - void * ALIGNED(8) a02, void * ALIGNED(8) a03, - void * ALIGNED(8) a04, void * ALIGNED(8) a05, - void * ALIGNED(8) a06, void * ALIGNED(8) a07, - void * ALIGNED(8) a08, void * ALIGNED(8) a09, - void * ALIGNED(8) a10, void * ALIGNED(8) a11, - void * ALIGNED(8) a12, void * ALIGNED(8) a13, - void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) - { - ((int * ALIGNED(8))a00)[0] = a.i[ 0]; - ((int * ALIGNED(8))a00)[1] = b.i[ 0]; - - ((int * ALIGNED(8))a01)[0] = a.i[ 1]; - ((int * ALIGNED(8))a01)[1] = b.i[ 1]; - - ((int * ALIGNED(8))a02)[0] = a.i[ 2]; - ((int * ALIGNED(8))a02)[1] = b.i[ 2]; - - ((int * ALIGNED(8))a03)[0] = a.i[ 3]; - ((int * ALIGNED(8))a03)[1] = b.i[ 3]; - - ((int * ALIGNED(8))a04)[0] = a.i[ 4]; - ((int * ALIGNED(8))a04)[1] = b.i[ 4]; - - ((int * ALIGNED(8))a05)[0] = a.i[ 5]; - ((int * ALIGNED(8))a05)[1] = b.i[ 5]; - - ((int * ALIGNED(8))a06)[0] = a.i[ 6]; - ((int * ALIGNED(8))a06)[1] = b.i[ 6]; - - ((int * ALIGNED(8))a07)[0] = a.i[ 7]; - ((int * ALIGNED(8))a07)[1] = b.i[ 7]; - - ((int * ALIGNED(8))a08)[0] = a.i[ 8]; - ((int * ALIGNED(8))a08)[1] = b.i[ 8]; - - ((int * ALIGNED(8))a09)[0] = a.i[ 9]; - ((int * ALIGNED(8))a09)[1] = b.i[ 9]; - - ((int * ALIGNED(8))a10)[0] = a.i[10]; - ((int * ALIGNED(8))a10)[1] = b.i[10]; - - ((int * ALIGNED(8))a11)[0] = a.i[11]; - ((int * 
ALIGNED(8))a11)[1] = b.i[11]; - - ((int * ALIGNED(8))a12)[0] = a.i[12]; - ((int * ALIGNED(8))a12)[1] = b.i[12]; - - ((int * ALIGNED(8))a13)[0] = a.i[13]; - ((int * ALIGNED(8))a13)[1] = b.i[13]; - - ((int * ALIGNED(8))a14)[0] = a.i[14]; - ((int * ALIGNED(8))a14)[1] = b.i[14]; - - ((int * ALIGNED(8))a15)[0] = a.i[15]; - ((int * ALIGNED(8))a15)[1] = b.i[15]; - } - - inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = 
a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - } - - inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; 
- - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - } - - 
inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - const v16 &e, const v16 &f, const v16 &g, const v16 &h, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - ((int * ALIGNED(64))a00)[4] = e.i[ 0]; - ((int * ALIGNED(64))a00)[5] = f.i[ 0]; - ((int * ALIGNED(64))a00)[6] = g.i[ 0]; - ((int * ALIGNED(64))a00)[7] = h.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - ((int * ALIGNED(64))a01)[4] = e.i[ 1]; - ((int * ALIGNED(64))a01)[5] = f.i[ 1]; - ((int * ALIGNED(64))a01)[6] = g.i[ 1]; - ((int * ALIGNED(64))a01)[7] = h.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - ((int * ALIGNED(64))a02)[4] = e.i[ 2]; - ((int * ALIGNED(64))a02)[5] = f.i[ 2]; - ((int * ALIGNED(64))a02)[6] = g.i[ 2]; - ((int * ALIGNED(64))a02)[7] = h.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - ((int * ALIGNED(64))a03)[4] = e.i[ 3]; - ((int * ALIGNED(64))a03)[5] = f.i[ 3]; - ((int * ALIGNED(64))a03)[6] = g.i[ 3]; - ((int * ALIGNED(64))a03)[7] = h.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * 
ALIGNED(64))a04)[3] = d.i[ 4]; - ((int * ALIGNED(64))a04)[4] = e.i[ 4]; - ((int * ALIGNED(64))a04)[5] = f.i[ 4]; - ((int * ALIGNED(64))a04)[6] = g.i[ 4]; - ((int * ALIGNED(64))a04)[7] = h.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - ((int * ALIGNED(64))a05)[4] = e.i[ 5]; - ((int * ALIGNED(64))a05)[5] = f.i[ 5]; - ((int * ALIGNED(64))a05)[6] = g.i[ 5]; - ((int * ALIGNED(64))a05)[7] = h.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - ((int * ALIGNED(64))a06)[4] = e.i[ 6]; - ((int * ALIGNED(64))a06)[5] = f.i[ 6]; - ((int * ALIGNED(64))a06)[6] = g.i[ 6]; - ((int * ALIGNED(64))a06)[7] = h.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - ((int * ALIGNED(64))a07)[4] = e.i[ 7]; - ((int * ALIGNED(64))a07)[5] = f.i[ 7]; - ((int * ALIGNED(64))a07)[6] = g.i[ 7]; - ((int * ALIGNED(64))a07)[7] = h.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - ((int * ALIGNED(64))a08)[4] = e.i[ 8]; - ((int * ALIGNED(64))a08)[5] = f.i[ 8]; - ((int * ALIGNED(64))a08)[6] = g.i[ 8]; - ((int * ALIGNED(64))a08)[7] = h.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - ((int * ALIGNED(64))a09)[4] = e.i[ 9]; - ((int * ALIGNED(64))a09)[5] = f.i[ 9]; - ((int * ALIGNED(64))a09)[6] = g.i[ 9]; - ((int * ALIGNED(64))a09)[7] = h.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] 
= d.i[10]; - ((int * ALIGNED(64))a10)[4] = e.i[10]; - ((int * ALIGNED(64))a10)[5] = f.i[10]; - ((int * ALIGNED(64))a10)[6] = g.i[10]; - ((int * ALIGNED(64))a10)[7] = h.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - ((int * ALIGNED(64))a11)[4] = e.i[11]; - ((int * ALIGNED(64))a11)[5] = f.i[11]; - ((int * ALIGNED(64))a11)[6] = g.i[11]; - ((int * ALIGNED(64))a11)[7] = h.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - ((int * ALIGNED(64))a12)[4] = e.i[12]; - ((int * ALIGNED(64))a12)[5] = f.i[12]; - ((int * ALIGNED(64))a12)[6] = g.i[12]; - ((int * ALIGNED(64))a12)[7] = h.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - ((int * ALIGNED(64))a13)[4] = e.i[13]; - ((int * ALIGNED(64))a13)[5] = f.i[13]; - ((int * ALIGNED(64))a13)[6] = g.i[13]; - ((int * ALIGNED(64))a13)[7] = h.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - ((int * ALIGNED(64))a14)[4] = e.i[14]; - ((int * ALIGNED(64))a14)[5] = f.i[14]; - ((int * ALIGNED(64))a14)[6] = g.i[14]; - ((int * ALIGNED(64))a14)[7] = h.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - ((int * ALIGNED(64))a15)[4] = e.i[15]; - ((int * ALIGNED(64))a15)[5] = f.i[15]; - ((int * ALIGNED(64))a15)[6] = g.i[15]; - ((int * ALIGNED(64))a15)[7] = h.i[15]; - } - - inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const 
v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; - ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; - ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; - ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; - ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; - ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; - ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; - ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 1]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; - ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; - ((int * ALIGNED(64))a01)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; - ((int * 
ALIGNED(64))a02)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; - ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; - ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; - ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; - ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; - ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; - ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; - ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; - ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; - ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; - ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; - ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; - ((int * ALIGNED(64))a04)[12] = b12.i[ 4]; - ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; - ((int * 
ALIGNED(64))a04)[14] = b14.i[ 4]; - ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; - ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; - ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; - ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; - ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; - ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; - ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; - ((int * ALIGNED(64))a06)[14] = b14.i[ 6]; - ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; - ((int * ALIGNED(64))a07)[ 8] = b08.i[ 7]; - ((int * ALIGNED(64))a07)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; - ((int * 
ALIGNED(64))a07)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a08)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; - - ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; - ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; - ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; - ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; - ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; - ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; - ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; - ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; - ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; - ((int * ALIGNED(64))a09)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; - - ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a10)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; - ((int * 
ALIGNED(64))a10)[ 8] = b08.i[10]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; - ((int * ALIGNED(64))a10)[10] = b10.i[10]; - ((int * ALIGNED(64))a10)[11] = b11.i[10]; - ((int * ALIGNED(64))a10)[12] = b12.i[10]; - ((int * ALIGNED(64))a10)[13] = b13.i[10]; - ((int * ALIGNED(64))a10)[14] = b14.i[10]; - ((int * ALIGNED(64))a10)[15] = b15.i[10]; - - ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; - ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; - ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; - ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; - ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; - ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; - ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; - ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a11)[10] = b10.i[11]; - ((int * ALIGNED(64))a11)[11] = b11.i[11]; - ((int * ALIGNED(64))a11)[12] = b12.i[11]; - ((int * ALIGNED(64))a11)[13] = b13.i[11]; - ((int * ALIGNED(64))a11)[14] = b14.i[11]; - ((int * ALIGNED(64))a11)[15] = b15.i[11]; - - ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a12)[ 8] = b08.i[12]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; - ((int * ALIGNED(64))a12)[10] = b10.i[12]; - ((int * ALIGNED(64))a12)[11] = b11.i[12]; - ((int * ALIGNED(64))a12)[12] = b12.i[12]; - ((int * ALIGNED(64))a12)[13] = b13.i[12]; - ((int * ALIGNED(64))a12)[14] = b14.i[12]; - ((int * ALIGNED(64))a12)[15] = b15.i[12]; - - ((int * ALIGNED(64))a13)[ 0] = b00.i[13]; - ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; - ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; - ((int * ALIGNED(64))a13)[ 3] = b03.i[13]; - ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; - ((int * 
ALIGNED(64))a13)[ 5] = b05.i[13]; - ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; - ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a13)[10] = b10.i[13]; - ((int * ALIGNED(64))a13)[11] = b11.i[13]; - ((int * ALIGNED(64))a13)[12] = b12.i[13]; - ((int * ALIGNED(64))a13)[13] = b13.i[13]; - ((int * ALIGNED(64))a13)[14] = b14.i[13]; - ((int * ALIGNED(64))a13)[15] = b15.i[13]; - - ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; - ((int * ALIGNED(64))a14)[10] = b10.i[14]; - ((int * ALIGNED(64))a14)[11] = b11.i[14]; - ((int * ALIGNED(64))a14)[12] = b12.i[14]; - ((int * ALIGNED(64))a14)[13] = b13.i[14]; - ((int * ALIGNED(64))a14)[14] = b14.i[14]; - ((int * ALIGNED(64))a14)[15] = b15.i[14]; - - ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; - ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; - ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; - ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; - ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; - ((int * ALIGNED(64))a15)[ 5] = b05.i[15]; - ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; - ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - inline void store_16x8_tr_p( const v16 &b00, - const v16 &b01, - const v16 &b02, - const v16 &b03, - 
const v16 &b04, - const v16 &b05, - const v16 &b06, - const v16 &b07, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 
4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * 
ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * ALIGNED(64))a07)[15] = b07.i[15]; - } - - inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, 
const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 
3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * 
ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * 
ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * ALIGNED(64))a07)[15] = b07.i[15]; - - ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; - ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; - ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; - ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; - ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; - ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; - ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; - ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; - ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; - ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; - ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; - ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; - ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; - ((int * ALIGNED(64))a09)[ 6] = b14.i[ 2]; - ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; - ((int * ALIGNED(64))a09)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; - ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; - ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; - ((int * ALIGNED(64))a10)[ 3] = b11.i[ 4]; - ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; - ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; - ((int * 
ALIGNED(64))a10)[ 6] = b14.i[ 4]; - ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; - ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; - ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; - ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; - ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; - ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; - ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; - ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; - ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; - ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; - ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; - ((int * ALIGNED(64))a12)[ 2] = b10.i[ 8]; - ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; - ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; - ((int * ALIGNED(64))a12)[ 5] = b13.i[ 8]; - ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; - ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; - ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a12)[10] = b10.i[ 9]; - ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; - - ((int * ALIGNED(64))a13)[ 0] = b08.i[10]; - ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; - ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; - ((int * 
ALIGNED(64))a13)[ 3] = b11.i[10]; - ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; - ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; - ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; - ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a13)[10] = b10.i[11]; - ((int * ALIGNED(64))a13)[11] = b11.i[11]; - ((int * ALIGNED(64))a13)[12] = b12.i[11]; - ((int * ALIGNED(64))a13)[13] = b13.i[11]; - ((int * ALIGNED(64))a13)[14] = b14.i[11]; - ((int * ALIGNED(64))a13)[15] = b15.i[11]; - - ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; - ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; - ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; - ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; - ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; - ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; - ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; - ((int * ALIGNED(64))a14)[ 7] = b15.i[12]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a14)[10] = b10.i[13]; - ((int * ALIGNED(64))a14)[11] = b11.i[13]; - ((int * ALIGNED(64))a14)[12] = b12.i[13]; - ((int * ALIGNED(64))a14)[13] = b13.i[13]; - ((int * ALIGNED(64))a14)[14] = b14.i[13]; - ((int * ALIGNED(64))a14)[15] = b15.i[13]; - - ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; - ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; - ((int * ALIGNED(64))a15)[ 2] = b10.i[14]; - ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; - ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; - ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; - ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; - ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - ////////////// - 
// v16int class - - class v16int : public v16 - { + t = ( (int* ALIGNED( 64 ))a )[0]; + ( (int* ALIGNED( 64 ))a )[0] = ( (int* ALIGNED( 64 ))b )[0]; + ( (int* ALIGNED( 64 ))b )[0] = t; + + t = ( (int* ALIGNED( 64 ))a )[1]; + ( (int* ALIGNED( 64 ))a )[1] = ( (int* ALIGNED( 64 ))b )[1]; + ( (int* ALIGNED( 64 ))b )[1] = t; + + t = ( (int* ALIGNED( 64 ))a )[2]; + ( (int* ALIGNED( 64 ))a )[2] = ( (int* ALIGNED( 64 ))b )[2]; + ( (int* ALIGNED( 64 ))b )[2] = t; + + t = ( (int* ALIGNED( 64 ))a )[3]; + ( (int* ALIGNED( 64 ))a )[3] = ( (int* ALIGNED( 64 ))b )[3]; + ( (int* ALIGNED( 64 ))b )[3] = t; + + t = ( (int* ALIGNED( 64 ))a )[4]; + ( (int* ALIGNED( 64 ))a )[4] = ( (int* ALIGNED( 64 ))b )[4]; + ( (int* ALIGNED( 64 ))b )[4] = t; + + t = ( (int* ALIGNED( 64 ))a )[5]; + ( (int* ALIGNED( 64 ))a )[5] = ( (int* ALIGNED( 64 ))b )[5]; + ( (int* ALIGNED( 64 ))b )[5] = t; + + t = ( (int* ALIGNED( 64 ))a )[6]; + ( (int* ALIGNED( 64 ))a )[6] = ( (int* ALIGNED( 64 ))b )[6]; + ( (int* ALIGNED( 64 ))b )[6] = t; + + t = ( (int* ALIGNED( 64 ))a )[7]; + ( (int* ALIGNED( 64 ))a )[7] = ( (int* ALIGNED( 64 ))b )[7]; + ( (int* ALIGNED( 64 ))b )[7] = t; + + t = ( (int* ALIGNED( 64 ))a )[8]; + ( (int* ALIGNED( 64 ))a )[8] = ( (int* ALIGNED( 64 ))b )[8]; + ( (int* ALIGNED( 64 ))b )[8] = t; + + t = ( (int* ALIGNED( 64 ))a )[9]; + ( (int* ALIGNED( 64 ))a )[9] = ( (int* ALIGNED( 64 ))b )[9]; + ( (int* ALIGNED( 64 ))b )[9] = t; + + t = ( (int* ALIGNED( 64 ))a )[10]; + ( (int* ALIGNED( 64 ))a )[10] = ( (int* ALIGNED( 64 ))b )[10]; + ( (int* ALIGNED( 64 ))b )[10] = t; + + t = ( (int* ALIGNED( 64 ))a )[11]; + ( (int* ALIGNED( 64 ))a )[11] = ( (int* ALIGNED( 64 ))b )[11]; + ( (int* ALIGNED( 64 ))b )[11] = t; + + t = ( (int* ALIGNED( 64 ))a )[12]; + ( (int* ALIGNED( 64 ))a )[12] = ( (int* ALIGNED( 64 ))b )[12]; + ( (int* ALIGNED( 64 ))b )[12] = t; + + t = ( (int* ALIGNED( 64 ))a )[13]; + ( (int* ALIGNED( 64 ))a )[13] = ( (int* ALIGNED( 64 ))b )[13]; + ( (int* ALIGNED( 64 ))b )[13] = t; + + t = ( (int* 
ALIGNED( 64 ))a )[14]; + ( (int* ALIGNED( 64 ))a )[14] = ( (int* ALIGNED( 64 ))b )[14]; + ( (int* ALIGNED( 64 ))b )[14] = t; + + t = ( (int* ALIGNED( 64 ))a )[15]; + ( (int* ALIGNED( 64 ))a )[15] = ( (int* ALIGNED( 64 ))b )[15]; + ( (int* ALIGNED( 64 ))b )[15] = t; +} + +// v16 transposed memory manipulation functions + +inline void load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) +{ + a.i[0] = ( (const int*)a00 )[0]; + a.i[1] = ( (const int*)a01 )[0]; + a.i[2] = ( (const int*)a02 )[0]; + a.i[3] = ( (const int*)a03 )[0]; + a.i[4] = ( (const int*)a04 )[0]; + a.i[5] = ( (const int*)a05 )[0]; + a.i[6] = ( (const int*)a06 )[0]; + a.i[7] = ( (const int*)a07 )[0]; + a.i[8] = ( (const int*)a08 )[0]; + a.i[9] = ( (const int*)a09 )[0]; + a.i[10] = ( (const int*)a10 )[0]; + a.i[11] = ( (const int*)a11 )[0]; + a.i[12] = ( (const int*)a12 )[0]; + a.i[13] = ( (const int*)a13 )[0]; + a.i[14] = ( (const int*)a14 )[0]; + a.i[15] = ( (const int*)a15 )[0]; +} + +inline void +load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a00 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a01 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a02 
)[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a02 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a03 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a04 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a05 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a06 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a07 )[1]; + + a.i[8] = ( (const int* ALIGNED( 8 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 8 ))a08 )[1]; + + a.i[9] = ( (const int* ALIGNED( 8 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 8 ))a09 )[1]; + + a.i[10] = ( (const int* ALIGNED( 8 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 8 ))a10 )[1]; + + a.i[11] = ( (const int* ALIGNED( 8 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 8 ))a11 )[1]; + + a.i[12] = ( (const int* ALIGNED( 8 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 8 ))a12 )[1]; + + a.i[13] = ( (const int* ALIGNED( 8 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 8 ))a13 )[1]; + + a.i[14] = ( (const int* ALIGNED( 8 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 8 ))a14 )[1]; + + a.i[15] = ( (const int* ALIGNED( 8 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 8 ))a15 )[1]; +} + +inline void +load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 
))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 
))a13 )[2]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; +} + +inline void +load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( 
(const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] 
= ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; +} + +inline void +load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + e.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + f.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + g.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + h.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + e.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + f.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + g.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + h.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + e.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + f.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + g.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + h.i[2] = ( (const int* ALIGNED( 64 ))a02 )[7]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( 
(const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + e.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + f.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + g.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + h.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + e.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + f.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + g.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + h.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + e.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + f.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + g.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + h.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + e.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + f.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + g.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + h.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + e.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + f.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + g.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + h.i[7] = ( (const int* ALIGNED( 64 ))a07 )[7]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 
)[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + e.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + f.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + g.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + h.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + e.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + f.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + g.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + h.i[9] = ( (const int* ALIGNED( 64 ))a09 )[7]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + e.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + f.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + g.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + h.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + e.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + f.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + g.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + h.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + e.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + f.i[12] = ( (const int* ALIGNED( 64 ))a12 )[5]; + g.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + h.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 
)[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + e.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + f.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + g.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + h.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + e.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + f.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + g.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + h.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + e.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + f.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + g.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + h.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; +} + +inline void +load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const 
int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b08.i[0] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b10.i[0] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b15.i[0] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b10.i[1] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b08.i[2] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b11.i[2] = ( 
(const int* ALIGNED( 64 ))a02 )[11]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b08.i[4] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b15.i[4] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + 
b02.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b14.i[5] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b08.i[6] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b09.i[6] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a07 
)[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + b08.i[8] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a08 )[10]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b00.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a09 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a09 )[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[9] = ( (const int* 
ALIGNED( 64 ))a09 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[10] = ( (const int* ALIGNED( 64 ))a10 )[8]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[10] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a10 )[14]; + b15.i[10] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b00.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[11] = ( (const int* ALIGNED( 64 ))a11 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b05.i[12] = ( (const int* 
ALIGNED( 64 ))a12 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[12] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a12 )[15]; + + b00.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[14] = ( (const int* ALIGNED( 64 ))a14 )[8]; + b09.i[14] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[14] = ( (const int* 
ALIGNED( 64 ))a14 )[11]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[14] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b00.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( (const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 ))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void +load_16x8_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 
64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const 
int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + 
b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + 
b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; +} + +inline void +load_16x16_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 
)[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 
64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + 
b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b08.i[0] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a08 )[1]; + b10.i[0] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b15.i[0] = ( (const int* ALIGNED( 64 ))a08 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[1] = ( 
(const int* ALIGNED( 64 ))a08 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b08.i[2] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b11.i[2] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a09 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a09 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a09 )[15]; + + b08.i[4] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b15.i[4] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a10 )[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[5] = ( (const int* ALIGNED( 64 ))a10 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b08.i[6] = ( (const int* ALIGNED( 64 ))a11 )[0]; + 
b09.i[6] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a11 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b08.i[8] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a12 )[2]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a12 )[5]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[9] = ( (const int* ALIGNED( 64 ))a12 )[15]; + + b08.i[10] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b13.i[10] = ( (const int* ALIGNED( 64 ))a13 )[5]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b15.i[10] = ( (const int* ALIGNED( 64 
))a13 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[11] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b08.i[12] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a14 )[3]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a14 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a14 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b08.i[14] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b09.i[14] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b11.i[14] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b13.i[14] = ( (const int* ALIGNED( 64 ))a15 )[5]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( (const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 
))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void store_16x1_tr( const v16& a, void* a00, void* a01, void* a02, + void* a03, void* a04, void* a05, void* a06, + void* a07, void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, void* a14, + void* a15 ) +{ + ( (int*)a00 )[0] = a.i[0]; + ( (int*)a01 )[0] = a.i[1]; + ( (int*)a02 )[0] = a.i[2]; + ( (int*)a03 )[0] = a.i[3]; + ( (int*)a04 )[0] = a.i[4]; + ( (int*)a05 )[0] = a.i[5]; + ( (int*)a06 )[0] = a.i[6]; + ( (int*)a07 )[0] = a.i[7]; + ( (int*)a08 )[0] = a.i[8]; + ( (int*)a09 )[0] = a.i[9]; + ( (int*)a10 )[0] = a.i[10]; + ( (int*)a11 )[0] = a.i[11]; + ( (int*)a12 )[0] = a.i[12]; + ( (int*)a13 )[0] = a.i[13]; + ( (int*)a14 )[0] = a.i[14]; + ( (int*)a15 )[0] = a.i[15]; +} + +inline void store_16x2_tr( const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, + void* ALIGNED( 8 ) a03, void* ALIGNED( 8 ) a04, + void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, + void* ALIGNED( 8 ) a09, void* ALIGNED( 8 ) a10, + void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) +{ + ( (int* ALIGNED( 8 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a00 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a01 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a02 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a03 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a04 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a05 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a06 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a07 )[1] = b.i[7]; + + ( (int* ALIGNED( 8 
))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 8 ))a08 )[1] = b.i[8]; + + ( (int* ALIGNED( 8 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 8 ))a09 )[1] = b.i[9]; + + ( (int* ALIGNED( 8 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 8 ))a10 )[1] = b.i[10]; + + ( (int* ALIGNED( 8 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 8 ))a11 )[1] = b.i[11]; + + ( (int* ALIGNED( 8 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 8 ))a12 )[1] = b.i[12]; + + ( (int* ALIGNED( 8 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 8 ))a13 )[1] = b.i[13]; + + ( (int* ALIGNED( 8 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 8 ))a14 )[1] = b.i[14]; + + ( (int* ALIGNED( 8 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 8 ))a15 )[1] = b.i[15]; +} + +inline void store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = 
a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; +} + +inline void store_16x4_tr( const v16& a, const v16& b, const v16& c, + const v16& d, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, + void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, + void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + + ( (int* ALIGNED( 64 
))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* 
ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; +} + +inline void store_16x8_tr( + const v16& a, const v16& b, const v16& c, const v16& d, const v16& e, + const v16& f, const v16& g, const v16& h, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = e.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = f.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = g.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = h.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = e.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = f.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = g.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = h.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( 
(int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = e.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = f.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = g.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = h.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = e.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = f.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = g.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = h.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = e.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = f.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = g.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = h.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = e.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = f.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = g.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = h.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = e.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = f.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = g.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = h.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = e.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = f.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = g.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = h.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* 
ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = e.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = f.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = g.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = h.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = e.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = f.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = g.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = h.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = e.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = f.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = g.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = h.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = e.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = f.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = g.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = h.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = e.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = f.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = g.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = h.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = e.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = f.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = 
g.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = h.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = e.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = f.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = g.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = h.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = e.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = f.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = g.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = h.i[15]; +} + +inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b08.i[0]; + ( (int* ALIGNED( 64 ))a00 )[9] = b09.i[0]; + ( (int* ALIGNED( 64 ))a00 )[10] = b10.i[0]; + ( (int* ALIGNED( 64 ))a00 
)[11] = b11.i[0]; + ( (int* ALIGNED( 64 ))a00 )[12] = b12.i[0]; + ( (int* ALIGNED( 64 ))a00 )[13] = b13.i[0]; + ( (int* ALIGNED( 64 ))a00 )[14] = b14.i[0]; + ( (int* ALIGNED( 64 ))a00 )[15] = b15.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[1]; + ( (int* ALIGNED( 64 ))a01 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a01 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a01 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a01 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a01 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a01 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a01 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a01 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a02 )[8] = b08.i[2]; + ( (int* ALIGNED( 64 ))a02 )[9] = b09.i[2]; + ( (int* ALIGNED( 64 ))a02 )[10] = b10.i[2]; + ( (int* ALIGNED( 64 ))a02 )[11] = b11.i[2]; + ( (int* ALIGNED( 64 ))a02 )[12] = b12.i[2]; + ( (int* ALIGNED( 64 ))a02 )[13] = b13.i[2]; + ( (int* ALIGNED( 64 ))a02 )[14] = b14.i[2]; + ( (int* ALIGNED( 64 ))a02 )[15] = b15.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[3]; + ( (int* ALIGNED( 64 
))a03 )[7] = b07.i[3]; + ( (int* ALIGNED( 64 ))a03 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a03 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a03 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a03 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a03 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a03 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a03 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a03 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a04 )[8] = b08.i[4]; + ( (int* ALIGNED( 64 ))a04 )[9] = b09.i[4]; + ( (int* ALIGNED( 64 ))a04 )[10] = b10.i[4]; + ( (int* ALIGNED( 64 ))a04 )[11] = b11.i[4]; + ( (int* ALIGNED( 64 ))a04 )[12] = b12.i[4]; + ( (int* ALIGNED( 64 ))a04 )[13] = b13.i[4]; + ( (int* ALIGNED( 64 ))a04 )[14] = b14.i[4]; + ( (int* ALIGNED( 64 ))a04 )[15] = b15.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[5]; + ( (int* ALIGNED( 64 ))a05 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a05 )[9] = b09.i[5]; + ( (int* ALIGNED( 64 ))a05 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a05 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a05 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a05 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a05 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a05 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[6]; + ( (int* 
ALIGNED( 64 ))a06 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a06 )[8] = b08.i[6]; + ( (int* ALIGNED( 64 ))a06 )[9] = b09.i[6]; + ( (int* ALIGNED( 64 ))a06 )[10] = b10.i[6]; + ( (int* ALIGNED( 64 ))a06 )[11] = b11.i[6]; + ( (int* ALIGNED( 64 ))a06 )[12] = b12.i[6]; + ( (int* ALIGNED( 64 ))a06 )[13] = b13.i[6]; + ( (int* ALIGNED( 64 ))a06 )[14] = b14.i[6]; + ( (int* ALIGNED( 64 ))a06 )[15] = b15.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[7]; + ( (int* ALIGNED( 64 ))a07 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a07 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a07 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a07 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a07 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a07 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a07 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a07 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[8]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[8]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[8]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[8]; + ( (int* ALIGNED( 64 ))a08 )[12] = b12.i[8]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[8]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[8]; + ( 
(int* ALIGNED( 64 ))a08 )[15] = b15.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b00.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b01.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = b02.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = b03.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = b04.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = b05.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = b06.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = b07.i[9]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[9]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[10]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[10]; + ( (int* ALIGNED( 64 ))a10 )[10] = b10.i[10]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[10]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[10]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[10]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[10]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b00.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b01.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = b02.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = b03.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = b04.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = b05.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = b06.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = b07.i[11]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 
))a11 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[12]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[12]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[12]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[12]; + ( (int* ALIGNED( 64 ))a12 )[12] = b12.i[12]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[12]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[12]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b00.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b01.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = b02.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = b03.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = b04.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = b05.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = b06.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = b07.i[13]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a13 )[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = b04.i[14]; + ( (int* ALIGNED( 
64 ))a14 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[14]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[14]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[14]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[14]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[14]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[14]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[14]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b00.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b01.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = b02.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = b03.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = b04.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = b05.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = b06.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = b07.i[15]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +inline void store_16x8_tr_p( const v16& b00, const v16& b01, const v16& b02, + const v16& b03, const v16& b04, const v16& b05, + const v16& b06, const v16& b07, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 
))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* 
ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = 
b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( (int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; +} + +inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) 
a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* 
ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + 
( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( 
(int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b08.i[0]; + ( (int* ALIGNED( 64 ))a08 )[1] = b09.i[0]; + ( (int* ALIGNED( 64 ))a08 )[2] = b10.i[0]; + ( (int* ALIGNED( 64 ))a08 )[3] = b11.i[0]; + ( (int* ALIGNED( 64 ))a08 )[4] = b12.i[0]; + ( (int* ALIGNED( 64 ))a08 )[5] = b13.i[0]; + ( (int* ALIGNED( 64 ))a08 )[6] = b14.i[0]; + ( (int* ALIGNED( 64 ))a08 )[7] = b15.i[0]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a08 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a08 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b08.i[2]; + ( (int* ALIGNED( 64 ))a09 )[1] = b09.i[2]; + ( (int* ALIGNED( 64 ))a09 )[2] = b10.i[2]; + ( (int* ALIGNED( 64 ))a09 )[3] = b11.i[2]; + ( (int* ALIGNED( 64 ))a09 )[4] = b12.i[2]; + ( (int* ALIGNED( 64 ))a09 )[5] = b13.i[2]; + ( (int* ALIGNED( 64 ))a09 )[6] = b14.i[2]; + ( (int* ALIGNED( 64 ))a09 )[7] = b15.i[2]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a10 )[0] = b08.i[4]; + ( (int* ALIGNED( 64 ))a10 )[1] = b09.i[4]; + ( (int* ALIGNED( 64 ))a10 )[2] = b10.i[4]; + ( (int* ALIGNED( 64 ))a10 )[3] = b11.i[4]; + ( (int* ALIGNED( 64 ))a10 )[4] = b12.i[4]; + ( (int* ALIGNED( 64 ))a10 )[5] = b13.i[4]; + ( (int* ALIGNED( 64 ))a10 )[6] = b14.i[4]; + ( (int* ALIGNED( 64 ))a10 )[7] = b15.i[4]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[5]; + 
( (int* ALIGNED( 64 ))a10 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b08.i[6]; + ( (int* ALIGNED( 64 ))a11 )[1] = b09.i[6]; + ( (int* ALIGNED( 64 ))a11 )[2] = b10.i[6]; + ( (int* ALIGNED( 64 ))a11 )[3] = b11.i[6]; + ( (int* ALIGNED( 64 ))a11 )[4] = b12.i[6]; + ( (int* ALIGNED( 64 ))a11 )[5] = b13.i[6]; + ( (int* ALIGNED( 64 ))a11 )[6] = b14.i[6]; + ( (int* ALIGNED( 64 ))a11 )[7] = b15.i[6]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a11 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b08.i[8]; + ( (int* ALIGNED( 64 ))a12 )[1] = b09.i[8]; + ( (int* ALIGNED( 64 ))a12 )[2] = b10.i[8]; + ( (int* ALIGNED( 64 ))a12 )[3] = b11.i[8]; + ( (int* ALIGNED( 64 ))a12 )[4] = b12.i[8]; + ( (int* ALIGNED( 64 ))a12 )[5] = b13.i[8]; + ( (int* ALIGNED( 64 ))a12 )[6] = b14.i[8]; + ( (int* ALIGNED( 64 ))a12 )[7] = b15.i[8]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a12 )[12] = b12.i[9]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b08.i[10]; + ( (int* ALIGNED( 64 ))a13 )[1] = b09.i[10]; + ( (int* ALIGNED( 64 ))a13 )[2] = b10.i[10]; + ( (int* ALIGNED( 64 ))a13 )[3] = b11.i[10]; + ( (int* ALIGNED( 64 ))a13 )[4] = b12.i[10]; + ( (int* ALIGNED( 64 ))a13 )[5] = 
b13.i[10]; + ( (int* ALIGNED( 64 ))a13 )[6] = b14.i[10]; + ( (int* ALIGNED( 64 ))a13 )[7] = b15.i[10]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a13 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b08.i[12]; + ( (int* ALIGNED( 64 ))a14 )[1] = b09.i[12]; + ( (int* ALIGNED( 64 ))a14 )[2] = b10.i[12]; + ( (int* ALIGNED( 64 ))a14 )[3] = b11.i[12]; + ( (int* ALIGNED( 64 ))a14 )[4] = b12.i[12]; + ( (int* ALIGNED( 64 ))a14 )[5] = b13.i[12]; + ( (int* ALIGNED( 64 ))a14 )[6] = b14.i[12]; + ( (int* ALIGNED( 64 ))a14 )[7] = b15.i[12]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b08.i[14]; + ( (int* ALIGNED( 64 ))a15 )[1] = b09.i[14]; + ( (int* ALIGNED( 64 ))a15 )[2] = b10.i[14]; + ( (int* ALIGNED( 64 ))a15 )[3] = b11.i[14]; + ( (int* ALIGNED( 64 ))a15 )[4] = b12.i[14]; + ( (int* ALIGNED( 64 ))a15 )[5] = b13.i[14]; + ( (int* ALIGNED( 64 ))a15 )[6] = b14.i[14]; + ( (int* ALIGNED( 64 ))a15 )[7] = b15.i[14]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +////////////// +// v16int 
class + +class v16int : public v16 +{ // v16int prefix unary operator friends - friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator~( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16int prefix increment / decrement operator friends - friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a ) ALWAYS_INLINE; // v16int postfix increment / decrement operator friends - friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a, int ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a, int ) ALWAYS_INLINE; // v16int binary operator friends - friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator |( 
const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator*(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator/( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator%( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator^( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator|( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int logical operator friends - friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int 
operator==( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16int abs( const v16int& a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& t, + const v16& f ) ALWAYS_INLINE; // v16float unary operator friends - friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; 
- friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float miscellaneous friends - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; public: - // v16int constructors / destructors - v16int() {} // Default constructor + v16int() {} // Default constructor - v16int( const v16int &a ) // Copy constructor + v16int( const v16int& a ) // Copy constructor { - i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; - i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; - i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; - i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = 
a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - v16int( const v16 &a ) // Init from mixed + v16int( const v16& a ) // Init from mixed { - i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; - i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; - i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; - i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; + i[8] = a.i[8]; + i[9] = a.i[9]; + i[10] = a.i[10]; + i[11] = a.i[11]; + i[12] = a.i[12]; + i[13] = a.i[13]; + i[14] = a.i[14]; + i[15] = a.i[15]; } - v16int( int a ) // Init from scalar + v16int( int a ) // Init from scalar { - i[ 0] = a; i[ 1] = a; i[ 2] = a; i[ 3] = a; - i[ 4] = a; i[ 5] = a; i[ 6] = a; i[ 7] = a; - i[ 8] = a; i[ 9] = a; i[10] = a; i[11] = a; - i[12] = a; i[13] = a; i[14] = a; i[15] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; + i[4] = a; + i[5] = a; + i[6] = a; + i[7] = a; + i[8] = a; + i[9] = a; + i[10] = a; + i[11] = a; + i[12] = a; + i[13] = a; + i[14] = a; + i[15] = a; } - v16int( int i00, int i01, int i02, int i03, - int i04, int i05, int i06, int i07, - int i08, int i09, int i10, int i11, - int i12, int i13, int i14, int i15 ) // Init from scalars + v16int( int i00, int i01, int i02, int i03, int i04, int i05, int i06, + int i07, int i08, int i09, int i10, int i11, int i12, int i13, + int i14, int i15 ) // Init from scalars { - i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; - i[ 4] = i04; i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; - i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; - i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + i[0] = i00; + i[1] = i01; + i[2] = i02; + i[3] = i03; + i[4] = i04; + i[5] = i05; + i[6] = i06; + i[7] = i07; + i[8] = i08; 
+ i[9] = i09; + i[10] = i10; + i[11] = i11; + i[12] = i12; + i[13] = i13; + i[14] = i14; + i[15] = i15; } - ~v16int() {} // Destructor + ~v16int() {} // Destructor // v16int assignment operators -# define ASSIGN(op) \ - inline v16int &operator op( const v16int &b ) \ - { \ - i[ 0] op b.i[ 0]; \ - i[ 1] op b.i[ 1]; \ - i[ 2] op b.i[ 2]; \ - i[ 3] op b.i[ 3]; \ - i[ 4] op b.i[ 4]; \ - i[ 5] op b.i[ 5]; \ - i[ 6] op b.i[ 6]; \ - i[ 7] op b.i[ 7]; \ - i[ 8] op b.i[ 8]; \ - i[ 9] op b.i[ 9]; \ - i[10] op b.i[10]; \ - i[11] op b.i[11]; \ - i[12] op b.i[12]; \ - i[13] op b.i[13]; \ - i[14] op b.i[14]; \ - i[15] op b.i[15]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16int& operator op( const v16int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + i[8] op b.i[8]; \ + i[9] op b.i[9]; \ + i[10] op b.i[10]; \ + i[11] op b.i[11]; \ + i[12] op b.i[12]; \ + i[13] op b.i[13]; \ + i[14] op b.i[14]; \ + i[15] op b.i[15]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v16int member access operator - inline int &operator []( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v16int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v16int operator op( const v16int& a ) \ + { \ + v16int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + b.i[8] = ( op a.i[8] ); 
\ + b.i[9] = ( op a.i[9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ } - inline int operator ()( int n ) - { - return i[n]; - } - }; - - // v16int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v16int operator op( const v16int & a ) \ - { \ - v16int b; \ - b.i[ 0] = (op a.i[ 0]); \ - b.i[ 1] = (op a.i[ 1]); \ - b.i[ 2] = (op a.i[ 2]); \ - b.i[ 3] = (op a.i[ 3]); \ - b.i[ 4] = (op a.i[ 4]); \ - b.i[ 5] = (op a.i[ 5]); \ - b.i[ 6] = (op a.i[ 6]); \ - b.i[ 7] = (op a.i[ 7]); \ - b.i[ 8] = (op a.i[ 8]); \ - b.i[ 9] = (op a.i[ 9]); \ - b.i[10] = (op a.i[10]); \ - b.i[11] = (op a.i[11]); \ - b.i[12] = (op a.i[12]); \ - b.i[13] = (op a.i[13]); \ - b.i[14] = (op a.i[14]); \ - b.i[15] = (op a.i[15]); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v16int operator !( const v16int & a ) - { +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v16int operator!( const v16int& a ) +{ v16int b; - b.i[ 0] = - ( !a.i[ 0] ); - b.i[ 1] = - ( !a.i[ 1] ); - b.i[ 2] = - ( !a.i[ 2] ); - b.i[ 3] = - ( !a.i[ 3] ); - b.i[ 4] = - ( !a.i[ 4] ); - b.i[ 5] = - ( !a.i[ 5] ); - b.i[ 6] = - ( !a.i[ 6] ); - b.i[ 7] = - ( !a.i[ 7] ); - b.i[ 8] = - ( !a.i[ 8] ); - b.i[ 9] = - ( !a.i[ 9] ); - b.i[10] = - ( !a.i[10] ); - b.i[11] = - ( !a.i[11] ); - b.i[12] = - ( !a.i[12] ); - b.i[13] = - ( !a.i[13] ); - b.i[14] = - ( !a.i[14] ); - b.i[15] = - ( !a.i[15] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); + b.i[8] = -( !a.i[8] ); + b.i[9] = -( !a.i[9] ); + b.i[10] = -( !a.i[10] ); + b.i[11] = -( !a.i[11] ); + b.i[12] = -( !a.i[12] ); + b.i[13] = -( !a.i[13] ); + b.i[14] = -( !a.i[14] ); + b.i[15] = -( !a.i[15] ); return b; - } - - PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - 
// v16int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v16int operator op( v16int & a ) \ - { \ - v16int b; \ - b.i[ 0] = ( op a.i[ 0] ); \ - b.i[ 1] = ( op a.i[ 1] ); \ - b.i[ 2] = ( op a.i[ 2] ); \ - b.i[ 3] = ( op a.i[ 3] ); \ - b.i[ 4] = ( op a.i[ 4] ); \ - b.i[ 5] = ( op a.i[ 5] ); \ - b.i[ 6] = ( op a.i[ 6] ); \ - b.i[ 7] = ( op a.i[ 7] ); \ - b.i[ 8] = ( op a.i[ 8] ); \ - b.i[ 9] = ( op a.i[ 9] ); \ - b.i[10] = ( op a.i[10] ); \ - b.i[11] = ( op a.i[11] ); \ - b.i[12] = ( op a.i[12] ); \ - b.i[13] = ( op a.i[13] ); \ - b.i[14] = ( op a.i[14] ); \ - b.i[15] = ( op a.i[15] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v16int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v16int operator op( v16int & a, int ) \ - { \ - v16int b; \ - b.i[ 0] = ( a.i[ 0] op ); \ - b.i[ 1] = ( a.i[ 1] op ); \ - b.i[ 2] = ( a.i[ 2] op ); \ - b.i[ 3] = ( a.i[ 3] op ); \ - b.i[ 4] = ( a.i[ 4] op ); \ - b.i[ 5] = ( a.i[ 5] op ); \ - b.i[ 6] = ( a.i[ 6] op ); \ - b.i[ 7] = ( a.i[ 7] op ); \ - b.i[ 8] = ( a.i[ 8] op ); \ - b.i[ 9] = ( a.i[ 9] op ); \ - b.i[10] = ( a.i[10] op ); \ - b.i[11] = ( a.i[11] op ); \ - b.i[12] = ( a.i[12] op ); \ - b.i[13] = ( a.i[13] op ); \ - b.i[14] = ( a.i[14] op ); \ - b.i[15] = ( a.i[15] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v16int binary operators - -# define BINARY(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - c.i[ 0] = a.i[ 0] op b.i[ 0]; \ - c.i[ 1] = a.i[ 1] op b.i[ 1]; \ - c.i[ 2] = a.i[ 2] op b.i[ 2]; \ - c.i[ 3] = a.i[ 3] op b.i[ 3]; \ - c.i[ 4] = a.i[ 4] op b.i[ 4]; \ - c.i[ 5] = a.i[ 5] op b.i[ 5]; \ - c.i[ 6] = a.i[ 6] op b.i[ 6]; \ - c.i[ 7] = a.i[ 7] op b.i[ 7]; \ - c.i[ 8] = a.i[ 8] op b.i[ 8]; \ - c.i[ 9] = a.i[ 9] op b.i[ 9]; \ - c.i[10] = a.i[10] op b.i[10]; \ - c.i[11] = a.i[11] op b.i[11]; \ - c.i[12] = a.i[12] op b.i[12]; \ - 
c.i[13] = a.i[13] op b.i[13]; \ - c.i[14] = a.i[14] op b.i[14]; \ - c.i[15] = a.i[15] op b.i[15]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v16int logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - c.i[ 0] = - ( a.i[ 0] op b.i[ 0] ); \ - c.i[ 1] = - ( a.i[ 1] op b.i[ 1] ); \ - c.i[ 2] = - ( a.i[ 2] op b.i[ 2] ); \ - c.i[ 3] = - ( a.i[ 3] op b.i[ 3] ); \ - c.i[ 4] = - ( a.i[ 4] op b.i[ 4] ); \ - c.i[ 5] = - ( a.i[ 5] op b.i[ 5] ); \ - c.i[ 6] = - ( a.i[ 6] op b.i[ 6] ); \ - c.i[ 7] = - ( a.i[ 7] op b.i[ 7] ); \ - c.i[ 8] = - ( a.i[ 8] op b.i[ 8] ); \ - c.i[ 9] = - ( a.i[ 9] op b.i[ 9] ); \ - c.i[10] = - ( a.i[10] op b.i[10] ); \ - c.i[11] = - ( a.i[11] op b.i[11] ); \ - c.i[12] = - ( a.i[12] op b.i[12] ); \ - c.i[13] = - ( a.i[13] op b.i[13] ); \ - c.i[14] = - ( a.i[14] op b.i[14] ); \ - c.i[15] = - ( a.i[15] op b.i[15] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16int miscellaneous functions - - inline v16int abs( const v16int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v16int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a ) \ + { \ + v16int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + b.i[8] = ( op a.i[8] ); \ + b.i[9] = ( op a.i[9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef 
PREFIX_INCDEC + +// v16int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a, int ) \ + { \ + v16int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + b.i[8] = ( a.i[8] op ); \ + b.i[9] = ( a.i[9] op ); \ + b.i[10] = ( a.i[10] op ); \ + b.i[11] = ( a.i[11] op ); \ + b.i[12] = ( a.i[12] op ); \ + b.i[13] = ( a.i[13] op ); \ + b.i[14] = ( a.i[14] op ); \ + b.i[15] = ( a.i[15] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v16int binary operators + +#define BINARY( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + c.i[8] = a.i[8] op b.i[8]; \ + c.i[9] = a.i[9] op b.i[9]; \ + c.i[10] = a.i[10] op b.i[10]; \ + c.i[11] = a.i[11] op b.i[11]; \ + c.i[12] = a.i[12] op b.i[12]; \ + c.i[13] = a.i[13] op b.i[13]; \ + c.i[14] = a.i[14] op b.i[14]; \ + c.i[15] = a.i[15] op b.i[15]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v16int logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + c.i[8] = -( a.i[8] op b.i[8] ); \ + 
c.i[9] = -( a.i[9] op b.i[9] ); \ + c.i[10] = -( a.i[10] op b.i[10] ); \ + c.i[11] = -( a.i[11] op b.i[11] ); \ + c.i[12] = -( a.i[12] op b.i[12] ); \ + c.i[13] = -( a.i[13] op b.i[13] ); \ + c.i[14] = -( a.i[14] op b.i[14] ); \ + c.i[15] = -( a.i[15] op b.i[15] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16int miscellaneous functions + +inline v16int abs( const v16int& a ) +{ v16int b; - b.i[ 0] = ( a.i[ 0] >= 0 ) ? a.i[ 0] : - a.i[ 0]; - b.i[ 1] = ( a.i[ 1] >= 0 ) ? a.i[ 1] : - a.i[ 1]; - b.i[ 2] = ( a.i[ 2] >= 0 ) ? a.i[ 2] : - a.i[ 2]; - b.i[ 3] = ( a.i[ 3] >= 0 ) ? a.i[ 3] : - a.i[ 3]; - b.i[ 4] = ( a.i[ 4] >= 0 ) ? a.i[ 4] : - a.i[ 4]; - b.i[ 5] = ( a.i[ 5] >= 0 ) ? a.i[ 5] : - a.i[ 5]; - b.i[ 6] = ( a.i[ 6] >= 0 ) ? a.i[ 6] : - a.i[ 6]; - b.i[ 7] = ( a.i[ 7] >= 0 ) ? a.i[ 7] : - a.i[ 7]; - b.i[ 8] = ( a.i[ 8] >= 0 ) ? a.i[ 8] : - a.i[ 8]; - b.i[ 9] = ( a.i[ 9] >= 0 ) ? a.i[ 9] : - a.i[ 9]; - b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : - a.i[10]; - b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : - a.i[11]; - b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : - a.i[12]; - b.i[13] = ( a.i[13] >= 0 ) ? a.i[13] : - a.i[13]; - b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : - a.i[14]; - b.i[15] = ( a.i[15] >= 0 ) ? a.i[15] : - a.i[15]; + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; + b.i[8] = ( a.i[8] >= 0 ) ? a.i[8] : -a.i[8]; + b.i[9] = ( a.i[9] >= 0 ) ? a.i[9] : -a.i[9]; + b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : -a.i[10]; + b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : -a.i[11]; + b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : -a.i[12]; + b.i[13] = ( a.i[13] >= 0 ) ? 
a.i[13] : -a.i[13]; + b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : -a.i[14]; + b.i[15] = ( a.i[15] >= 0 ) ? a.i[15] : -a.i[15]; return b; - } +} - inline v16 czero( const v16int &c, const v16 &a ) - { +inline v16 czero( const v16int& c, const v16& a ) +{ v16 b; - b.i[ 0] = a.i[ 0] & ~c.i[ 0]; - b.i[ 1] = a.i[ 1] & ~c.i[ 1]; - b.i[ 2] = a.i[ 2] & ~c.i[ 2]; - b.i[ 3] = a.i[ 3] & ~c.i[ 3]; - b.i[ 4] = a.i[ 4] & ~c.i[ 4]; - b.i[ 5] = a.i[ 5] & ~c.i[ 5]; - b.i[ 6] = a.i[ 6] & ~c.i[ 6]; - b.i[ 7] = a.i[ 7] & ~c.i[ 7]; - b.i[ 8] = a.i[ 8] & ~c.i[ 8]; - b.i[ 9] = a.i[ 9] & ~c.i[ 9]; + b.i[0] = a.i[0] & ~c.i[0]; + b.i[1] = a.i[1] & ~c.i[1]; + b.i[2] = a.i[2] & ~c.i[2]; + b.i[3] = a.i[3] & ~c.i[3]; + b.i[4] = a.i[4] & ~c.i[4]; + b.i[5] = a.i[5] & ~c.i[5]; + b.i[6] = a.i[6] & ~c.i[6]; + b.i[7] = a.i[7] & ~c.i[7]; + b.i[8] = a.i[8] & ~c.i[8]; + b.i[9] = a.i[9] & ~c.i[9]; b.i[10] = a.i[10] & ~c.i[10]; b.i[11] = a.i[11] & ~c.i[11]; b.i[12] = a.i[12] & ~c.i[12]; @@ -3363,22 +3382,22 @@ namespace v16 b.i[15] = a.i[15] & ~c.i[15]; return b; - } +} - inline v16 notczero( const v16int &c, const v16 &a ) - { +inline v16 notczero( const v16int& c, const v16& a ) +{ v16 b; - b.i[ 0] = a.i[ 0] & c.i[ 0]; - b.i[ 1] = a.i[ 1] & c.i[ 1]; - b.i[ 2] = a.i[ 2] & c.i[ 2]; - b.i[ 3] = a.i[ 3] & c.i[ 3]; - b.i[ 4] = a.i[ 4] & c.i[ 4]; - b.i[ 5] = a.i[ 5] & c.i[ 5]; - b.i[ 6] = a.i[ 6] & c.i[ 6]; - b.i[ 7] = a.i[ 7] & c.i[ 7]; - b.i[ 8] = a.i[ 8] & c.i[ 8]; - b.i[ 9] = a.i[ 9] & c.i[ 9]; + b.i[0] = a.i[0] & c.i[0]; + b.i[1] = a.i[1] & c.i[1]; + b.i[2] = a.i[2] & c.i[2]; + b.i[3] = a.i[3] & c.i[3]; + b.i[4] = a.i[4] & c.i[4]; + b.i[5] = a.i[5] & c.i[5]; + b.i[6] = a.i[6] & c.i[6]; + b.i[7] = a.i[7] & c.i[7]; + b.i[8] = a.i[8] & c.i[8]; + b.i[9] = a.i[9] & c.i[9]; b.i[10] = a.i[10] & c.i[10]; b.i[11] = a.i[11] & c.i[11]; b.i[12] = a.i[12] & c.i[12]; @@ -3387,22 +3406,22 @@ namespace v16 b.i[15] = a.i[15] & c.i[15]; return b; - } +} - inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) - { 
+inline v16 merge( const v16int& c, const v16& t, const v16& f ) +{ v16 m; - m.i[ 0] = ( f.i[ 0] & ~c.i[ 0] ) | ( t.i[ 0] & c.i[ 0] ); - m.i[ 1] = ( f.i[ 1] & ~c.i[ 1] ) | ( t.i[ 1] & c.i[ 1] ); - m.i[ 2] = ( f.i[ 2] & ~c.i[ 2] ) | ( t.i[ 2] & c.i[ 2] ); - m.i[ 3] = ( f.i[ 3] & ~c.i[ 3] ) | ( t.i[ 3] & c.i[ 3] ); - m.i[ 4] = ( f.i[ 4] & ~c.i[ 4] ) | ( t.i[ 4] & c.i[ 4] ); - m.i[ 5] = ( f.i[ 5] & ~c.i[ 5] ) | ( t.i[ 5] & c.i[ 5] ); - m.i[ 6] = ( f.i[ 6] & ~c.i[ 6] ) | ( t.i[ 6] & c.i[ 6] ); - m.i[ 7] = ( f.i[ 7] & ~c.i[ 7] ) | ( t.i[ 7] & c.i[ 7] ); - m.i[ 8] = ( f.i[ 8] & ~c.i[ 8] ) | ( t.i[ 8] & c.i[ 8] ); - m.i[ 9] = ( f.i[ 9] & ~c.i[ 9] ) | ( t.i[ 9] & c.i[ 9] ); + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + m.i[4] = ( f.i[4] & ~c.i[4] ) | ( t.i[4] & c.i[4] ); + m.i[5] = ( f.i[5] & ~c.i[5] ) | ( t.i[5] & c.i[5] ); + m.i[6] = ( f.i[6] & ~c.i[6] ) | ( t.i[6] & c.i[6] ); + m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); + m.i[8] = ( f.i[8] & ~c.i[8] ) | ( t.i[8] & c.i[8] ); + m.i[9] = ( f.i[9] & ~c.i[9] ) | ( t.i[9] & c.i[9] ); m.i[10] = ( f.i[10] & ~c.i[10] ) | ( t.i[10] & c.i[10] ); m.i[11] = ( f.i[11] & ~c.i[11] ) | ( t.i[11] & c.i[11] ); m.i[12] = ( f.i[12] & ~c.i[12] ) | ( t.i[12] & c.i[12] ); @@ -3411,186 +3430,263 @@ namespace v16 m.i[15] = ( f.i[15] & ~c.i[15] ) | ( t.i[15] & c.i[15] ); return m; - } +} - //////////////// - // v16float class +//////////////// +// v16float class - class v16float : public v16 - { +class v16float : public v16 +{ // v16float prefix unary operator friends - friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + friend inline 
v16float operator+( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator~( const v16float& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16float prefix increment / decrement operator friends - friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a ) ALWAYS_INLINE; // v16float postfix increment / decrement operator friends - friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a, int ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a, int ) ALWAYS_INLINE; // v16float binary operator friends - friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator*(const v16float& a, + const v16float& b)ALWAYS_INLINE; + friend inline v16float operator/( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend 
inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float math library friends -# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ - const v16float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v16float fn( const v16float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + 
friend inline v16float fn( const v16float& a, const v16float& b ) \ + ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v16float miscellaneous friends - friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rsqrt( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp( const v16float& a ) ALWAYS_INLINE; + 
friend inline v16float fma( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline void increment_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void scale_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; public: - // v16float constructors / destructors - v16float() {} // Default constructor + v16float() {} // Default constructor - v16float( const v16float &a ) // Copy constructor + v16float( const v16float& a ) // Copy constructor { - f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; - f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; - f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; - f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + f[8] = a.f[8]; + f[9] = a.f[9]; + f[10] = a.f[10]; + f[11] = a.f[11]; + f[12] = a.f[12]; + f[13] = a.f[13]; + f[14] = a.f[14]; + f[15] = a.f[15]; } - v16float( const v16 &a ) // Init from mixed + v16float( const v16& a ) // Init from mixed { - f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; - f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; - f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; - f[12] = a.f[12]; 
f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + f[8] = a.f[8]; + f[9] = a.f[9]; + f[10] = a.f[10]; + f[11] = a.f[11]; + f[12] = a.f[12]; + f[13] = a.f[13]; + f[14] = a.f[14]; + f[15] = a.f[15]; } - v16float( float a ) // Init from scalar + v16float( float a ) // Init from scalar { - f[ 0] = a; f[ 1] = a; f[ 2] = a; f[ 3] = a; - f[ 4] = a; f[ 5] = a; f[ 6] = a; f[ 7] = a; - f[ 8] = a; f[ 9] = a; f[10] = a; f[11] = a; - f[12] = a; f[13] = a; f[14] = a; f[15] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; + f[8] = a; + f[9] = a; + f[10] = a; + f[11] = a; + f[12] = a; + f[13] = a; + f[14] = a; + f[15] = a; } - v16float( float f00, float f01, float f02, float f03, - float f04, float f05, float f06, float f07, - float f08, float f09, float f10, float f11, - float f12, float f13, float f14, float f15 ) // Init from scalars + v16float( float f00, float f01, float f02, float f03, float f04, float f05, + float f06, float f07, float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars { - f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; - f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; - f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; - f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + f[0] = f00; + f[1] = f01; + f[2] = f02; + f[3] = f03; + f[4] = f04; + f[5] = f05; + f[6] = f06; + f[7] = f07; + f[8] = f08; + f[9] = f09; + f[10] = f10; + f[11] = f11; + f[12] = f12; + f[13] = f13; + f[14] = f14; + f[15] = f15; } - ~v16float() {} // Destructor + ~v16float() {} // Destructor // v16float assignment operators -# define ASSIGN(op) \ - inline v16float &operator op( const v16float &b ) \ - { \ - f[ 0] op b.f[ 0]; \ - f[ 1] op b.f[ 1]; \ - f[ 2] op b.f[ 2]; \ - f[ 3] op b.f[ 3]; \ - f[ 4] op b.f[ 4]; \ - f[ 5] op b.f[ 5]; \ - 
f[ 6] op b.f[ 6]; \ - f[ 7] op b.f[ 7]; \ - f[ 8] op b.f[ 8]; \ - f[ 9] op b.f[ 9]; \ - f[10] op b.f[10]; \ - f[11] op b.f[11]; \ - f[12] op b.f[12]; \ - f[13] op b.f[13]; \ - f[14] op b.f[14]; \ - f[15] op b.f[15]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16float& operator op( const v16float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + f[8] op b.f[8]; \ + f[9] op b.f[9]; \ + f[10] op b.f[10]; \ + f[11] op b.f[11]; \ + f[12] op b.f[12]; \ + f[13] op b.f[13]; \ + f[14] op b.f[14]; \ + f[15] op b.f[15]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v16float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v16float prefix unary operators +// v16float prefix unary operators - inline v16float operator +( const v16float &a ) - { +inline v16float operator+( const v16float& a ) +{ v16float b; - b.f[ 0] = +a.f[ 0]; - b.f[ 1] = +a.f[ 1]; - b.f[ 2] = +a.f[ 2]; - b.f[ 3] = +a.f[ 3]; - b.f[ 4] = +a.f[ 4]; - b.f[ 5] = +a.f[ 5]; - b.f[ 6] = +a.f[ 6]; - b.f[ 7] = +a.f[ 7]; - b.f[ 8] = +a.f[ 8]; - b.f[ 9] = +a.f[ 9]; + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + b.f[4] = +a.f[4]; + b.f[5] = +a.f[5]; + b.f[6] = +a.f[6]; + b.f[7] = +a.f[7]; + b.f[8] = +a.f[8]; + b.f[9] = +a.f[9]; b.f[10] = +a.f[10]; b.f[11] = +a.f[11]; b.f[12] = +a.f[12]; @@ -3599,22 +3695,22 @@ namespace v16 b.f[15] = +a.f[15]; return b; - } +} - inline v16float operator -( const v16float &a ) - { +inline v16float operator-( const v16float& a ) +{ v16float b; - b.f[ 0] = -a.f[ 0]; - b.f[ 
1] = -a.f[ 1]; - b.f[ 2] = -a.f[ 2]; - b.f[ 3] = -a.f[ 3]; - b.f[ 4] = -a.f[ 4]; - b.f[ 5] = -a.f[ 5]; - b.f[ 6] = -a.f[ 6]; - b.f[ 7] = -a.f[ 7]; - b.f[ 8] = -a.f[ 8]; - b.f[ 9] = -a.f[ 9]; + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + b.f[4] = -a.f[4]; + b.f[5] = -a.f[5]; + b.f[6] = -a.f[6]; + b.f[7] = -a.f[7]; + b.f[8] = -a.f[8]; + b.f[9] = -a.f[9]; b.f[10] = -a.f[10]; b.f[11] = -a.f[11]; b.f[12] = -a.f[12]; @@ -3623,22 +3719,22 @@ namespace v16 b.f[15] = -a.f[15]; return b; - } +} - inline v16int operator !( const v16float &a ) - { +inline v16int operator!( const v16float& a ) +{ v16int b; - b.i[ 0] = a.i[ 0] ? 0 : -1; - b.i[ 1] = a.i[ 1] ? 0 : -1; - b.i[ 2] = a.i[ 2] ? 0 : -1; - b.i[ 3] = a.i[ 3] ? 0 : -1; - b.i[ 4] = a.i[ 4] ? 0 : -1; - b.i[ 5] = a.i[ 5] ? 0 : -1; - b.i[ 6] = a.i[ 6] ? 0 : -1; - b.i[ 7] = a.i[ 7] ? 0 : -1; - b.i[ 8] = a.i[ 8] ? 0 : -1; - b.i[ 9] = a.i[ 9] ? 0 : -1; + b.i[0] = a.i[0] ? 0 : -1; + b.i[1] = a.i[1] ? 0 : -1; + b.i[2] = a.i[2] ? 0 : -1; + b.i[3] = a.i[3] ? 0 : -1; + b.i[4] = a.i[4] ? 0 : -1; + b.i[5] = a.i[5] ? 0 : -1; + b.i[6] = a.i[6] ? 0 : -1; + b.i[7] = a.i[7] ? 0 : -1; + b.i[8] = a.i[8] ? 0 : -1; + b.i[9] = a.i[9] ? 0 : -1; b.i[10] = a.i[10] ? 0 : -1; b.i[11] = a.i[11] ? 0 : -1; b.i[12] = a.i[12] ? 0 : -1; @@ -3647,24 +3743,24 @@ namespace v16 b.i[15] = a.i[15] ? 
0 : -1; return b; - } +} - // v16float prefix increment / decrement operators +// v16float prefix increment / decrement operators - inline v16float operator ++( v16float &a ) - { +inline v16float operator++( v16float& a ) +{ v16float b; - b.f[ 0] = ++a.f[ 0]; - b.f[ 1] = ++a.f[ 1]; - b.f[ 2] = ++a.f[ 2]; - b.f[ 3] = ++a.f[ 3]; - b.f[ 4] = ++a.f[ 4]; - b.f[ 5] = ++a.f[ 5]; - b.f[ 6] = ++a.f[ 6]; - b.f[ 7] = ++a.f[ 7]; - b.f[ 8] = ++a.f[ 8]; - b.f[ 9] = ++a.f[ 9]; + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + b.f[4] = ++a.f[4]; + b.f[5] = ++a.f[5]; + b.f[6] = ++a.f[6]; + b.f[7] = ++a.f[7]; + b.f[8] = ++a.f[8]; + b.f[9] = ++a.f[9]; b.f[10] = ++a.f[10]; b.f[11] = ++a.f[11]; b.f[12] = ++a.f[12]; @@ -3673,22 +3769,22 @@ namespace v16 b.f[15] = ++a.f[15]; return b; - } +} - inline v16float operator --( v16float &a ) - { +inline v16float operator--( v16float& a ) +{ v16float b; - b.f[ 0] = --a.f[ 0]; - b.f[ 1] = --a.f[ 1]; - b.f[ 2] = --a.f[ 2]; - b.f[ 3] = --a.f[ 3]; - b.f[ 4] = --a.f[ 4]; - b.f[ 5] = --a.f[ 5]; - b.f[ 6] = --a.f[ 6]; - b.f[ 7] = --a.f[ 7]; - b.f[ 8] = --a.f[ 8]; - b.f[ 9] = --a.f[ 9]; + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + b.f[4] = --a.f[4]; + b.f[5] = --a.f[5]; + b.f[6] = --a.f[6]; + b.f[7] = --a.f[7]; + b.f[8] = --a.f[8]; + b.f[9] = --a.f[9]; b.f[10] = --a.f[10]; b.f[11] = --a.f[11]; b.f[12] = --a.f[12]; @@ -3697,24 +3793,24 @@ namespace v16 b.f[15] = --a.f[15]; return b; - } +} - // v16float postfix increment / decrement operators +// v16float postfix increment / decrement operators - inline v16float operator ++( v16float &a, int ) - { +inline v16float operator++( v16float& a, int ) +{ v16float b; - b.f[ 0] = a.f[ 0]++; - b.f[ 1] = a.f[ 1]++; - b.f[ 2] = a.f[ 2]++; - b.f[ 3] = a.f[ 3]++; - b.f[ 4] = a.f[ 4]++; - b.f[ 5] = a.f[ 5]++; - b.f[ 6] = a.f[ 6]++; - b.f[ 7] = a.f[ 7]++; - b.f[ 8] = a.f[ 8]++; - b.f[ 9] = a.f[ 9]++; + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + 
b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + b.f[4] = a.f[4]++; + b.f[5] = a.f[5]++; + b.f[6] = a.f[6]++; + b.f[7] = a.f[7]++; + b.f[8] = a.f[8]++; + b.f[9] = a.f[9]++; b.f[10] = a.f[10]++; b.f[11] = a.f[11]++; b.f[12] = a.f[12]++; @@ -3723,22 +3819,22 @@ namespace v16 b.f[15] = a.f[15]++; return b; - } +} - inline v16float operator --( v16float &a, int ) - { +inline v16float operator--( v16float& a, int ) +{ v16float b; - b.f[ 0] = a.f[ 0]--; - b.f[ 1] = a.f[ 1]--; - b.f[ 2] = a.f[ 2]--; - b.f[ 3] = a.f[ 3]--; - b.f[ 4] = a.f[ 4]--; - b.f[ 5] = a.f[ 5]--; - b.f[ 6] = a.f[ 6]--; - b.f[ 7] = a.f[ 7]--; - b.f[ 8] = a.f[ 8]--; - b.f[ 9] = a.f[ 9]--; + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + b.f[4] = a.f[4]--; + b.f[5] = a.f[5]--; + b.f[6] = a.f[6]--; + b.f[7] = a.f[7]--; + b.f[8] = a.f[8]--; + b.f[9] = a.f[9]--; b.f[10] = a.f[10]--; b.f[11] = a.f[11]--; b.f[12] = a.f[12]--; @@ -3747,317 +3843,335 @@ namespace v16 b.f[15] = a.f[15]--; return b; - } - - // v16float binary operators - -# define BINARY(op) \ - inline v16float operator op( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - c.f[ 0] = a.f[ 0] op b.f[ 0]; \ - c.f[ 1] = a.f[ 1] op b.f[ 1]; \ - c.f[ 2] = a.f[ 2] op b.f[ 2]; \ - c.f[ 3] = a.f[ 3] op b.f[ 3]; \ - c.f[ 4] = a.f[ 4] op b.f[ 4]; \ - c.f[ 5] = a.f[ 5] op b.f[ 5]; \ - c.f[ 6] = a.f[ 6] op b.f[ 6]; \ - c.f[ 7] = a.f[ 7] op b.f[ 7]; \ - c.f[ 8] = a.f[ 8] op b.f[ 8]; \ - c.f[ 9] = a.f[ 9] op b.f[ 9]; \ - c.f[10] = a.f[10] op b.f[10]; \ - c.f[11] = a.f[11] op b.f[11]; \ - c.f[12] = a.f[12] op b.f[12]; \ - c.f[13] = a.f[13] op b.f[13]; \ - c.f[14] = a.f[14] op b.f[14]; \ - c.f[15] = a.f[15] op b.f[15]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v16float logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16float &a, const v16float &b ) \ - { \ - v16int c; \ - c.i[ 0] = -( a.f[ 0] op b.f[ 0] ); \ - c.i[ 1] = -( a.f[ 1] op 
b.f[ 1] ); \ - c.i[ 2] = -( a.f[ 2] op b.f[ 2] ); \ - c.i[ 3] = -( a.f[ 3] op b.f[ 3] ); \ - c.i[ 4] = -( a.f[ 4] op b.f[ 4] ); \ - c.i[ 5] = -( a.f[ 5] op b.f[ 5] ); \ - c.i[ 6] = -( a.f[ 6] op b.f[ 6] ); \ - c.i[ 7] = -( a.f[ 7] op b.f[ 7] ); \ - c.i[ 8] = -( a.f[ 8] op b.f[ 8] ); \ - c.i[ 9] = -( a.f[ 9] op b.f[ 9] ); \ - c.i[10] = -( a.f[10] op b.f[10] ); \ - c.i[11] = -( a.f[11] op b.f[11] ); \ - c.i[12] = -( a.f[12] op b.f[12] ); \ - c.i[13] = -( a.f[13] op b.f[13] ); \ - c.i[14] = -( a.f[14] op b.f[14] ); \ - c.i[15] = -( a.f[15] op b.f[15] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16float math library functions - -# define CMATH_FR1(fn) \ - inline v16float fn( const v16float &a ) \ - { \ - v16float b; \ - b.f[ 0] = ::fn( a.f[ 0] ); \ - b.f[ 1] = ::fn( a.f[ 1] ); \ - b.f[ 2] = ::fn( a.f[ 2] ); \ - b.f[ 3] = ::fn( a.f[ 3] ); \ - b.f[ 4] = ::fn( a.f[ 4] ); \ - b.f[ 5] = ::fn( a.f[ 5] ); \ - b.f[ 6] = ::fn( a.f[ 6] ); \ - b.f[ 7] = ::fn( a.f[ 7] ); \ - b.f[ 8] = ::fn( a.f[ 8] ); \ - b.f[ 9] = ::fn( a.f[ 9] ); \ - b.f[10] = ::fn( a.f[10] ); \ - b.f[11] = ::fn( a.f[11] ); \ - b.f[12] = ::fn( a.f[12] ); \ - b.f[13] = ::fn( a.f[13] ); \ - b.f[14] = ::fn( a.f[14] ); \ - b.f[15] = ::fn( a.f[15] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v16float fn( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - c.f[ 0] = ::fn( a.f[ 0], b.f[ 0] ); \ - c.f[ 1] = ::fn( a.f[ 1], b.f[ 1] ); \ - c.f[ 2] = ::fn( a.f[ 2], b.f[ 2] ); \ - c.f[ 3] = ::fn( a.f[ 3], b.f[ 3] ); \ - c.f[ 4] = ::fn( a.f[ 4], b.f[ 4] ); \ - c.f[ 5] = ::fn( a.f[ 5], b.f[ 5] ); \ - c.f[ 6] = ::fn( a.f[ 6], b.f[ 6] ); \ - c.f[ 7] = ::fn( a.f[ 7], b.f[ 7] ); \ - c.f[ 8] = ::fn( a.f[ 8], b.f[ 8] ); \ - c.f[ 9] = ::fn( a.f[ 9], b.f[ 9] ); \ - c.f[10] = ::fn( a.f[10], b.f[10] ); \ - c.f[11] = ::fn( a.f[11], b.f[11] ); \ - c.f[12] = ::fn( a.f[12], b.f[12] ); \ - c.f[13] 
= ::fn( a.f[13], b.f[13] ); \ - c.f[14] = ::fn( a.f[14], b.f[14] ); \ - c.f[15] = ::fn( a.f[15], b.f[15] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v16float copysign( const v16float &a, const v16float &b ) - { - v16float c; - float t; - - t = ::fabs( a.f[ 0] ); - if( b.f[ 0] < 0 ) t = -t; - c.f[ 0] = t; - - t = ::fabs( a.f[ 1] ); - if( b.f[ 1] < 0 ) t = -t; - c.f[ 1] = t; - - t = ::fabs( a.f[ 2] ); - if( b.f[ 2] < 0 ) t = -t; - c.f[ 2] = t; - - t = ::fabs( a.f[ 3] ); - if( b.f[ 3] < 0 ) t = -t; - c.f[ 3] = t; +} + +// v16float binary operators + +#define BINARY( op ) \ + inline v16float operator op( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + c.f[8] = a.f[8] op b.f[8]; \ + c.f[9] = a.f[9] op b.f[9]; \ + c.f[10] = a.f[10] op b.f[10]; \ + c.f[11] = a.f[11] op b.f[11]; \ + c.f[12] = a.f[12] op b.f[12]; \ + c.f[13] = a.f[13] op b.f[13]; \ + c.f[14] = a.f[14] op b.f[14]; \ + c.f[15] = a.f[15] op b.f[15]; \ + return c; \ + } - t = ::fabs( a.f[ 4] ); - if( b.f[ 4] < 0 ) t = -t; - c.f[ 4] = t; +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v16float logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16float& a, const v16float& b ) \ + { \ + v16int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op 
b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + c.i[8] = -( a.f[8] op b.f[8] ); \ + c.i[9] = -( a.f[9] op b.f[9] ); \ + c.i[10] = -( a.f[10] op b.f[10] ); \ + c.i[11] = -( a.f[11] op b.f[11] ); \ + c.i[12] = -( a.f[12] op b.f[12] ); \ + c.i[13] = -( a.f[13] op b.f[13] ); \ + c.i[14] = -( a.f[14] op b.f[14] ); \ + c.i[15] = -( a.f[15] op b.f[15] ); \ + return c; \ + } - t = ::fabs( a.f[ 5] ); - if( b.f[ 5] < 0 ) t = -t; - c.f[ 5] = t; +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16float math library functions + +#define CMATH_FR1( fn ) \ + inline v16float fn( const v16float& a ) \ + { \ + v16float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + b.f[8] = ::fn( a.f[8] ); \ + b.f[9] = ::fn( a.f[9] ); \ + b.f[10] = ::fn( a.f[10] ); \ + b.f[11] = ::fn( a.f[11] ); \ + b.f[12] = ::fn( a.f[12] ); \ + b.f[13] = ::fn( a.f[13] ); \ + b.f[14] = ::fn( a.f[14] ); \ + b.f[15] = ::fn( a.f[15] ); \ + return b; \ + } - t = ::fabs( a.f[ 6] ); - if( b.f[ 6] < 0 ) t = -t; - c.f[ 6] = t; +#define CMATH_FR2( fn ) \ + inline v16float fn( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + c.f[8] = ::fn( a.f[8], b.f[8] ); \ + c.f[9] = ::fn( a.f[9], b.f[9] ); \ + c.f[10] = ::fn( a.f[10], b.f[10] ); \ + c.f[11] = ::fn( a.f[11], b.f[11] ); \ + c.f[12] = ::fn( a.f[12], b.f[12] ); \ + c.f[13] = ::fn( a.f[13], b.f[13] ); \ + c.f[14] = ::fn( a.f[14], b.f[14] ); \ + c.f[15] = 
::fn( a.f[15], b.f[15] ); \ + return c; \ + } - t = ::fabs( a.f[ 7] ); - if( b.f[ 7] < 0 ) t = -t; - c.f[ 7] = t; +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) - t = ::fabs( a.f[ 8] ); - if( b.f[ 8] < 0 ) t = -t; - c.f[ 8] = t; + inline v16float + copysign( const v16float& a, const v16float& b ) +{ + v16float c; + float t; - t = ::fabs( a.f[ 9] ); - if( b.f[ 9] < 0 ) t = -t; - c.f[ 9] = t; + t = ::fabs( a.f[0] ); + if ( b.f[0] < 0 ) + t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if ( b.f[1] < 0 ) + t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if ( b.f[2] < 0 ) + t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if ( b.f[3] < 0 ) + t = -t; + c.f[3] = t; + + t = ::fabs( a.f[4] ); + if ( b.f[4] < 0 ) + t = -t; + c.f[4] = t; + + t = ::fabs( a.f[5] ); + if ( b.f[5] < 0 ) + t = -t; + c.f[5] = t; + + t = ::fabs( a.f[6] ); + if ( b.f[6] < 0 ) + t = -t; + c.f[6] = t; + + t = ::fabs( a.f[7] ); + if ( b.f[7] < 0 ) + t = -t; + c.f[7] = t; + + t = ::fabs( a.f[8] ); + if ( b.f[8] < 0 ) + t = -t; + c.f[8] = t; + + t = ::fabs( a.f[9] ); + if ( b.f[9] < 0 ) + t = -t; + c.f[9] = t; t = ::fabs( a.f[10] ); - if( b.f[10] < 0 ) t = -t; + if ( b.f[10] < 0 ) + t = -t; c.f[10] = t; t = ::fabs( a.f[11] ); - if( b.f[11] < 0 ) t = -t; + if ( b.f[11] < 0 ) + t = -t; c.f[11] = t; t = ::fabs( a.f[12] ); - if( b.f[12] < 0 ) t = -t; + if ( b.f[12] < 0 ) + t = -t; c.f[12] = t; t = ::fabs( a.f[13] ); - if( b.f[13] < 0 ) t = -t; + if ( b.f[13] < 0 ) + t = -t; c.f[13] = t; t = ::fabs( a.f[14] ); - if( b.f[14] < 0 ) t = -t; + if ( b.f[14] < 0 ) + t = -t; c.f[14] = t; t = ::fabs( a.f[15] ); - if( b.f[15] < 0 ) t = -t; + if ( b.f[15] < 0 ) + t = -t; c.f[15] = t; return c; - } +} -# undef CMATH_FR1 
-# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v16float miscellaneous functions +// v16float miscellaneous functions - inline v16float rsqrt_approx( const v16float &a ) - { +inline v16float rsqrt_approx( const v16float& a ) +{ v16float b; - b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); - b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); - b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); - b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); - b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); - b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); - b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); - b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); - b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); - b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); - b.f[10] = ::sqrt( 1.0f/a.f[10] ); - b.f[11] = ::sqrt( 1.0f/a.f[11] ); - b.f[12] = ::sqrt( 1.0f/a.f[12] ); - b.f[13] = ::sqrt( 1.0f/a.f[13] ); - b.f[14] = ::sqrt( 1.0f/a.f[14] ); - b.f[15] = ::sqrt( 1.0f/a.f[15] ); + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + b.f[8] = ::sqrt( 1.0f / a.f[8] ); + b.f[9] = ::sqrt( 1.0f / a.f[9] ); + b.f[10] = ::sqrt( 1.0f / a.f[10] ); + b.f[11] = ::sqrt( 1.0f / a.f[11] ); + b.f[12] = ::sqrt( 1.0f / a.f[12] ); + b.f[13] = ::sqrt( 1.0f / a.f[13] ); + b.f[14] = ::sqrt( 1.0f / a.f[14] ); + b.f[15] = ::sqrt( 1.0f / a.f[15] ); return b; - } +} - inline v16float rsqrt( const v16float &a ) - { +inline v16float rsqrt( const v16float& a ) +{ v16float b; - b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); - b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); - b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); - b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); - b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); - b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); - b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); - b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); - b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); - b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); - b.f[10] = ::sqrt( 1.0f/a.f[10] ); - b.f[11] = ::sqrt( 1.0f/a.f[11] ); - b.f[12] = ::sqrt( 
1.0f/a.f[12] ); - b.f[13] = ::sqrt( 1.0f/a.f[13] ); - b.f[14] = ::sqrt( 1.0f/a.f[14] ); - b.f[15] = ::sqrt( 1.0f/a.f[15] ); + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + b.f[8] = ::sqrt( 1.0f / a.f[8] ); + b.f[9] = ::sqrt( 1.0f / a.f[9] ); + b.f[10] = ::sqrt( 1.0f / a.f[10] ); + b.f[11] = ::sqrt( 1.0f / a.f[11] ); + b.f[12] = ::sqrt( 1.0f / a.f[12] ); + b.f[13] = ::sqrt( 1.0f / a.f[13] ); + b.f[14] = ::sqrt( 1.0f / a.f[14] ); + b.f[15] = ::sqrt( 1.0f / a.f[15] ); return b; - } +} - inline v16float rcp_approx( const v16float &a ) - { +inline v16float rcp_approx( const v16float& a ) +{ v16float b; - b.f[ 0] = 1.0f/a.f[ 0]; - b.f[ 1] = 1.0f/a.f[ 1]; - b.f[ 2] = 1.0f/a.f[ 2]; - b.f[ 3] = 1.0f/a.f[ 3]; - b.f[ 4] = 1.0f/a.f[ 4]; - b.f[ 5] = 1.0f/a.f[ 5]; - b.f[ 6] = 1.0f/a.f[ 6]; - b.f[ 7] = 1.0f/a.f[ 7]; - b.f[ 8] = 1.0f/a.f[ 8]; - b.f[ 9] = 1.0f/a.f[ 9]; - b.f[10] = 1.0f/a.f[10]; - b.f[11] = 1.0f/a.f[11]; - b.f[12] = 1.0f/a.f[12]; - b.f[13] = 1.0f/a.f[13]; - b.f[14] = 1.0f/a.f[14]; - b.f[15] = 1.0f/a.f[15]; + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + b.f[8] = 1.0f / a.f[8]; + b.f[9] = 1.0f / a.f[9]; + b.f[10] = 1.0f / a.f[10]; + b.f[11] = 1.0f / a.f[11]; + b.f[12] = 1.0f / a.f[12]; + b.f[13] = 1.0f / a.f[13]; + b.f[14] = 1.0f / a.f[14]; + b.f[15] = 1.0f / a.f[15]; return b; - } +} - inline v16float rcp( const v16float &a ) - { +inline v16float rcp( const v16float& a ) +{ v16float b; - b.f[ 0] = 1.0f/a.f[ 0]; - b.f[ 1] = 1.0f/a.f[ 1]; - b.f[ 2] = 1.0f/a.f[ 2]; - b.f[ 3] = 1.0f/a.f[ 3]; - b.f[ 4] = 1.0f/a.f[ 4]; - b.f[ 5] = 1.0f/a.f[ 5]; - b.f[ 6] = 1.0f/a.f[ 6]; - 
b.f[ 7] = 1.0f/a.f[ 7]; - b.f[ 8] = 1.0f/a.f[ 8]; - b.f[ 9] = 1.0f/a.f[ 9]; - b.f[10] = 1.0f/a.f[10]; - b.f[11] = 1.0f/a.f[11]; - b.f[12] = 1.0f/a.f[12]; - b.f[13] = 1.0f/a.f[13]; - b.f[14] = 1.0f/a.f[14]; - b.f[15] = 1.0f/a.f[15]; + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + b.f[8] = 1.0f / a.f[8]; + b.f[9] = 1.0f / a.f[9]; + b.f[10] = 1.0f / a.f[10]; + b.f[11] = 1.0f / a.f[11]; + b.f[12] = 1.0f / a.f[12]; + b.f[13] = 1.0f / a.f[13]; + b.f[14] = 1.0f / a.f[14]; + b.f[15] = 1.0f / a.f[15]; return b; - } +} - inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fma( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; - d.f[ 0] = a.f[ 0] * b.f[ 0] + c.f[ 0]; - d.f[ 1] = a.f[ 1] * b.f[ 1] + c.f[ 1]; - d.f[ 2] = a.f[ 2] * b.f[ 2] + c.f[ 2]; - d.f[ 3] = a.f[ 3] * b.f[ 3] + c.f[ 3]; - d.f[ 4] = a.f[ 4] * b.f[ 4] + c.f[ 4]; - d.f[ 5] = a.f[ 5] * b.f[ 5] + c.f[ 5]; - d.f[ 6] = a.f[ 6] * b.f[ 6] + c.f[ 6]; - d.f[ 7] = a.f[ 7] * b.f[ 7] + c.f[ 7]; - d.f[ 8] = a.f[ 8] * b.f[ 8] + c.f[ 8]; - d.f[ 9] = a.f[ 9] * b.f[ 9] + c.f[ 9]; + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + d.f[4] = a.f[4] * b.f[4] + c.f[4]; + d.f[5] = a.f[5] * b.f[5] + c.f[5]; + d.f[6] = a.f[6] * b.f[6] + c.f[6]; + d.f[7] = a.f[7] * b.f[7] + c.f[7]; + d.f[8] = a.f[8] * b.f[8] + c.f[8]; + d.f[9] = a.f[9] * b.f[9] + c.f[9]; d.f[10] = a.f[10] * b.f[10] + c.f[10]; d.f[11] = a.f[11] * b.f[11] + c.f[11]; d.f[12] = a.f[12] * b.f[12] + c.f[12]; @@ -4066,22 +4180,22 @@ namespace v16 d.f[15] = a.f[15] * b.f[15] + c.f[15]; return d; - } +} - inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fms( const v16float& a, const v16float& b, const 
v16float& c ) +{ v16float d; - d.f[ 0] = a.f[ 0] * b.f[ 0] - c.f[ 0]; - d.f[ 1] = a.f[ 1] * b.f[ 1] - c.f[ 1]; - d.f[ 2] = a.f[ 2] * b.f[ 2] - c.f[ 2]; - d.f[ 3] = a.f[ 3] * b.f[ 3] - c.f[ 3]; - d.f[ 4] = a.f[ 4] * b.f[ 4] - c.f[ 4]; - d.f[ 5] = a.f[ 5] * b.f[ 5] - c.f[ 5]; - d.f[ 6] = a.f[ 6] * b.f[ 6] - c.f[ 6]; - d.f[ 7] = a.f[ 7] * b.f[ 7] - c.f[ 7]; - d.f[ 8] = a.f[ 8] * b.f[ 8] - c.f[ 8]; - d.f[ 9] = a.f[ 9] * b.f[ 9] - c.f[ 9]; + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + d.f[4] = a.f[4] * b.f[4] - c.f[4]; + d.f[5] = a.f[5] * b.f[5] - c.f[5]; + d.f[6] = a.f[6] * b.f[6] - c.f[6]; + d.f[7] = a.f[7] * b.f[7] - c.f[7]; + d.f[8] = a.f[8] * b.f[8] - c.f[8]; + d.f[9] = a.f[9] * b.f[9] - c.f[9]; d.f[10] = a.f[10] * b.f[10] - c.f[10]; d.f[11] = a.f[11] * b.f[11] - c.f[11]; d.f[12] = a.f[12] * b.f[12] - c.f[12]; @@ -4090,22 +4204,22 @@ namespace v16 d.f[15] = a.f[15] * b.f[15] - c.f[15]; return d; - } +} - inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fnms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; - d.f[ 0] = c.f[ 0] - a.f[ 0] * b.f[ 0]; - d.f[ 1] = c.f[ 1] - a.f[ 1] * b.f[ 1]; - d.f[ 2] = c.f[ 2] - a.f[ 2] * b.f[ 2]; - d.f[ 3] = c.f[ 3] - a.f[ 3] * b.f[ 3]; - d.f[ 4] = c.f[ 4] - a.f[ 4] * b.f[ 4]; - d.f[ 5] = c.f[ 5] - a.f[ 5] * b.f[ 5]; - d.f[ 6] = c.f[ 6] - a.f[ 6] * b.f[ 6]; - d.f[ 7] = c.f[ 7] - a.f[ 7] * b.f[ 7]; - d.f[ 8] = c.f[ 8] - a.f[ 8] * b.f[ 8]; - d.f[ 9] = c.f[ 9] - a.f[ 9] * b.f[ 9]; + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + d.f[4] = c.f[4] - a.f[4] * b.f[4]; + d.f[5] = c.f[5] - a.f[5] * b.f[5]; + d.f[6] = c.f[6] - a.f[6] * b.f[6]; + d.f[7] = c.f[7] - a.f[7] * b.f[7]; + d.f[8] = c.f[8] - a.f[8] * b.f[8]; + d.f[9] = c.f[9] - a.f[9] * b.f[9]; d.f[10] = c.f[10] 
- a.f[10] * b.f[10]; d.f[11] = c.f[11] - a.f[11] * b.f[11]; d.f[12] = c.f[12] - a.f[12] * b.f[12]; @@ -4114,22 +4228,22 @@ namespace v16 d.f[15] = c.f[15] - a.f[15] * b.f[15]; return d; - } +} - inline v16float clear_bits( const v16int &m, const v16float &a ) - { +inline v16float clear_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = ( ~m.i[ 0] ) & a.i[ 0]; - b.i[ 1] = ( ~m.i[ 1] ) & a.i[ 1]; - b.i[ 2] = ( ~m.i[ 2] ) & a.i[ 2]; - b.i[ 3] = ( ~m.i[ 3] ) & a.i[ 3]; - b.i[ 4] = ( ~m.i[ 4] ) & a.i[ 4]; - b.i[ 5] = ( ~m.i[ 5] ) & a.i[ 5]; - b.i[ 6] = ( ~m.i[ 6] ) & a.i[ 6]; - b.i[ 7] = ( ~m.i[ 7] ) & a.i[ 7]; - b.i[ 8] = ( ~m.i[ 8] ) & a.i[ 8]; - b.i[ 9] = ( ~m.i[ 9] ) & a.i[ 9]; + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = ( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + b.i[4] = ( ~m.i[4] ) & a.i[4]; + b.i[5] = ( ~m.i[5] ) & a.i[5]; + b.i[6] = ( ~m.i[6] ) & a.i[6]; + b.i[7] = ( ~m.i[7] ) & a.i[7]; + b.i[8] = ( ~m.i[8] ) & a.i[8]; + b.i[9] = ( ~m.i[9] ) & a.i[9]; b.i[10] = ( ~m.i[10] ) & a.i[10]; b.i[11] = ( ~m.i[11] ) & a.i[11]; b.i[12] = ( ~m.i[12] ) & a.i[12]; @@ -4138,22 +4252,22 @@ namespace v16 b.i[15] = ( ~m.i[15] ) & a.i[15]; return b; - } +} - inline v16float set_bits( const v16int &m, const v16float &a ) - { +inline v16float set_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = m.i[ 0] | a.i[ 0]; - b.i[ 1] = m.i[ 1] | a.i[ 1]; - b.i[ 2] = m.i[ 2] | a.i[ 2]; - b.i[ 3] = m.i[ 3] | a.i[ 3]; - b.i[ 4] = m.i[ 4] | a.i[ 4]; - b.i[ 5] = m.i[ 5] | a.i[ 5]; - b.i[ 6] = m.i[ 6] | a.i[ 6]; - b.i[ 7] = m.i[ 7] | a.i[ 7]; - b.i[ 8] = m.i[ 8] | a.i[ 8]; - b.i[ 9] = m.i[ 9] | a.i[ 9]; + b.i[0] = m.i[0] | a.i[0]; + b.i[1] = m.i[1] | a.i[1]; + b.i[2] = m.i[2] | a.i[2]; + b.i[3] = m.i[3] | a.i[3]; + b.i[4] = m.i[4] | a.i[4]; + b.i[5] = m.i[5] | a.i[5]; + b.i[6] = m.i[6] | a.i[6]; + b.i[7] = m.i[7] | a.i[7]; + b.i[8] = m.i[8] | a.i[8]; + b.i[9] = m.i[9] | a.i[9]; b.i[10] = m.i[10] | a.i[10]; 
b.i[11] = m.i[11] | a.i[11]; b.i[12] = m.i[12] | a.i[12]; @@ -4162,22 +4276,22 @@ namespace v16 b.i[15] = m.i[15] | a.i[15]; return b; - } +} - inline v16float toggle_bits( const v16int &m, const v16float &a ) - { +inline v16float toggle_bits( const v16int& m, const v16float& a ) +{ v16float b; - b.i[ 0] = m.i[ 0] ^ a.i[ 0]; - b.i[ 1] = m.i[ 1] ^ a.i[ 1]; - b.i[ 2] = m.i[ 2] ^ a.i[ 2]; - b.i[ 3] = m.i[ 3] ^ a.i[ 3]; - b.i[ 4] = m.i[ 4] ^ a.i[ 4]; - b.i[ 5] = m.i[ 5] ^ a.i[ 5]; - b.i[ 6] = m.i[ 6] ^ a.i[ 6]; - b.i[ 7] = m.i[ 7] ^ a.i[ 7]; - b.i[ 8] = m.i[ 8] ^ a.i[ 8]; - b.i[ 9] = m.i[ 9] ^ a.i[ 9]; + b.i[0] = m.i[0] ^ a.i[0]; + b.i[1] = m.i[1] ^ a.i[1]; + b.i[2] = m.i[2] ^ a.i[2]; + b.i[3] = m.i[3] ^ a.i[3]; + b.i[4] = m.i[4] ^ a.i[4]; + b.i[5] = m.i[5] ^ a.i[5]; + b.i[6] = m.i[6] ^ a.i[6]; + b.i[7] = m.i[7] ^ a.i[7]; + b.i[8] = m.i[8] ^ a.i[8]; + b.i[9] = m.i[9] ^ a.i[9]; b.i[10] = m.i[10] ^ a.i[10]; b.i[11] = m.i[11] ^ a.i[11]; b.i[12] = m.i[12] ^ a.i[12]; @@ -4186,67 +4300,67 @@ namespace v16 b.i[15] = m.i[15] ^ a.i[15]; return b; - } - - inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] += a.f[ 0]; - p[ 1] += a.f[ 1]; - p[ 2] += a.f[ 2]; - p[ 3] += a.f[ 3]; - p[ 4] += a.f[ 4]; - p[ 5] += a.f[ 5]; - p[ 6] += a.f[ 6]; - p[ 7] += a.f[ 7]; - p[ 8] += a.f[ 8]; - p[ 9] += a.f[ 9]; +} + +inline void increment_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] += a.f[0]; + p[1] += a.f[1]; + p[2] += a.f[2]; + p[3] += a.f[3]; + p[4] += a.f[4]; + p[5] += a.f[5]; + p[6] += a.f[6]; + p[7] += a.f[7]; + p[8] += a.f[8]; + p[9] += a.f[9]; p[10] += a.f[10]; p[11] += a.f[11]; p[12] += a.f[12]; p[13] += a.f[13]; p[14] += a.f[14]; p[15] += a.f[15]; - } - - inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] -= a.f[ 0]; - p[ 1] -= a.f[ 1]; - p[ 2] -= a.f[ 2]; - p[ 3] -= a.f[ 3]; - p[ 4] -= a.f[ 4]; - p[ 5] -= a.f[ 5]; - p[ 6] -= a.f[ 6]; - p[ 7] -= a.f[ 7]; - p[ 8] -= a.f[ 8]; - p[ 9] -= a.f[ 9]; +} + +inline 
void decrement_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] -= a.f[0]; + p[1] -= a.f[1]; + p[2] -= a.f[2]; + p[3] -= a.f[3]; + p[4] -= a.f[4]; + p[5] -= a.f[5]; + p[6] -= a.f[6]; + p[7] -= a.f[7]; + p[8] -= a.f[8]; + p[9] -= a.f[9]; p[10] -= a.f[10]; p[11] -= a.f[11]; p[12] -= a.f[12]; p[13] -= a.f[13]; p[14] -= a.f[14]; p[15] -= a.f[15]; - } - - inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) - { - p[ 0] *= a.f[ 0]; - p[ 1] *= a.f[ 1]; - p[ 2] *= a.f[ 2]; - p[ 3] *= a.f[ 3]; - p[ 4] *= a.f[ 4]; - p[ 5] *= a.f[ 5]; - p[ 6] *= a.f[ 6]; - p[ 7] *= a.f[ 7]; - p[ 8] *= a.f[ 8]; - p[ 9] *= a.f[ 9]; +} + +inline void scale_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ + p[0] *= a.f[0]; + p[1] *= a.f[1]; + p[2] *= a.f[2]; + p[3] *= a.f[3]; + p[4] *= a.f[4]; + p[5] *= a.f[5]; + p[6] *= a.f[6]; + p[7] *= a.f[7]; + p[8] *= a.f[8]; + p[9] *= a.f[9]; p[10] *= a.f[10]; p[11] *= a.f[11]; p[12] *= a.f[12]; p[13] *= a.f[13]; p[14] *= a.f[14]; p[15] *= a.f[15]; - } +} } // namespace v16 diff --git a/src/util/v16/v16_portable_v1.h b/src/util/v16/v16_portable_v1.h index 5f798341..b181764d 100644 --- a/src/util/v16/v16_portable_v1.h +++ b/src/util/v16/v16_portable_v1.h @@ -11,7 +11,7 @@ #include #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif // This does not work with gcc 5.3.1 and the -fopenmp-simd @@ -22,396 +22,294 @@ // #define ALWAYS_VECTORIZE _Pragma( "simd" ) -#define ALWAYS_VECTORIZE \ - _Pragma( "simd" ) \ - _Pragma( "vector aligned" ) +#define ALWAYS_VECTORIZE _Pragma( "simd" ) _Pragma( "vector aligned" ) -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v16 { - class v16; - class v16int; - class v16float; +class v16; +class v16int; +class v16float; - //////////////// - // v16 base class +//////////////// +// v16 base class - class v16 - { +class v16 +{ friend class v16int; friend class v16float; // v16 miscellaneous friends - friend inline int 
any( const v16 &a ) ALWAYS_INLINE; - friend inline int all( const v16 &a ) ALWAYS_INLINE; + friend inline int any( const v16& a ) ALWAYS_INLINE; + friend inline int all( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 splat( const v16& a ) ALWAYS_INLINE; - template - friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + template + friend inline v16 shuffle( const v16& a ) ALWAYS_INLINE; - friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + friend inline void swap( v16& a, v16& b ) ALWAYS_INLINE; + friend inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, + v16& a04, v16& a05, v16& a06, v16& a07, + v16& a08, v16& a09, v16& a10, v16& a11, + v16& a12, v16& a13, v16& a14, + v16& a15 ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& a, + const v16& b ) ALWAYS_INLINE; // v16 memory manipulation friends - friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; - friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; - friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; - friend inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) 
ALWAYS_INLINE; - friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + friend inline void load_16x1( const void* ALIGNED( 64 ) p, + v16& a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16& a, + void* ALIGNED( 64 ) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void* ALIGNED( 64 ) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void* ALIGNED( 64 ) dst, + const void* ALIGNED( 64 ) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void* ALIGNED( 64 ) a, + void* ALIGNED( 64 ) b ) ALWAYS_INLINE; // v16 transposed memory manipulation friends // Note: Half aligned values are permissible in the 16x2_tr variants. - friend inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) ALWAYS_INLINE; - friend inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) ALWAYS_INLINE; - friend inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - 
const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; - friend inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; - friend inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) 
a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; - friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; - - friend inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; - friend inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, - void * ALIGNED(8) a01, - void * ALIGNED(8) a02, - void * ALIGNED(8) a03, - void * ALIGNED(8) a04, - void * ALIGNED(8) a05, - void * ALIGNED(8) a06, - void * ALIGNED(8) a07, - void * ALIGNED(8) a08, - void * ALIGNED(8) a09, - void * ALIGNED(8) a10, - void * ALIGNED(8) a11, - void * ALIGNED(8) a12, - void * ALIGNED(8) a13, - 
void * ALIGNED(8) a14, - void * ALIGNED(8) a15 ) ALWAYS_INLINE; - friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x4_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void 
* ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; - friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, - const v16 &c, const v16 &d, - const v16 &e, const v16 &f, - const v16 &g, const v16 &h, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) ALWAYS_INLINE; - friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, - const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, - const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, - const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, - const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07, - void * ALIGNED(64) a08, - void * ALIGNED(64) a09, - void * ALIGNED(64) a10, - void * ALIGNED(64) a11, - void * ALIGNED(64) a12, - void * ALIGNED(64) a13, - void * ALIGNED(64) a14, - void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void + load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) ALWAYS_INLINE; + friend inline void + load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* 
ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) ALWAYS_INLINE; + friend inline void + load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) ALWAYS_INLINE; + friend inline void + load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) ALWAYS_INLINE; + friend inline void + load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + 
const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, + v16& h ) ALWAYS_INLINE; + friend inline void + load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, v16& a, + v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( + const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, v16& b00, + v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, v16& 
b06, v16& b07, + v16& b08, v16& b09, v16& b10, v16& b11, v16& b12, v16& b13, v16& b14, + v16& b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16& a, void* a00, void* a01, + void* a02, void* a03, void* a04, + void* a05, void* a06, void* a07, + void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, + void* a14, void* a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( + const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, void* ALIGNED( 8 ) a03, + void* ALIGNED( 8 ) a04, void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, void* ALIGNED( 8 ) a09, + void* ALIGNED( 8 ) a10, void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x4_tr( const v16& a, const v16& b, const v16& c, const v16& d, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, 
const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; + friend inline void + store_16x8_tr_p( const v16& a, const v16& b, const v16& c, const v16& d, + const v16& e, const v16& f, const v16& g, const v16& h, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + 
void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) ALWAYS_INLINE; protected: - - union - { - int i[16]; - float f[16]; + union { + int i[16]; + float f[16]; }; public: + v16() {} // Default constructor - v16() {} // Default constructor - - v16( const v16 &a ) // Copy constructor + v16( const v16& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; } - ~v16() {} // Default destructor - }; - - // v16 miscellaneous functions - - inline int any( const v16 &a ) - { - return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || - a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || - a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || - a.i[12] || a.i[13] || a.i[14] || a.i[15]; - } - - inline int all( const v16 &a ) - { - return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && - a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && - a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && - a.i[12] && a.i[13] && a.i[14] && a.i[15]; - } - - template - inline v16 splat( const v16 & a ) - { + ~v16() {} // Default destructor +}; + +// v16 miscellaneous functions + +inline int any( const v16& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7] || a.i[8] || a.i[9] || a.i[10] || a.i[11] || a.i[12] || + a.i[13] || a.i[14] || a.i[15]; +} + +inline int all( const v16& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7] && a.i[8] && a.i[9] && a.i[10] && a.i[11] && a.i[12] && + a.i[13] && a.i[14] && a.i[15]; +} + +template +inline v16 splat( const v16& a ) +{ v16 b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[n]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[n]; 
return b; - } +} - template - inline v16 shuffle( const v16 & a ) - { +template +inline v16 shuffle( const v16& a ) +{ v16 b; - b.i[ 0] = a.i[i00]; - b.i[ 1] = a.i[i01]; - b.i[ 2] = a.i[i02]; - b.i[ 3] = a.i[i03]; - b.i[ 4] = a.i[i04]; - b.i[ 5] = a.i[i05]; - b.i[ 6] = a.i[i06]; - b.i[ 7] = a.i[i07]; - b.i[ 8] = a.i[i08]; - b.i[ 9] = a.i[i09]; + b.i[0] = a.i[i00]; + b.i[1] = a.i[i01]; + b.i[2] = a.i[i02]; + b.i[3] = a.i[i03]; + b.i[4] = a.i[i04]; + b.i[5] = a.i[i05]; + b.i[6] = a.i[i06]; + b.i[7] = a.i[i07]; + b.i[8] = a.i[i08]; + b.i[9] = a.i[i09]; b.i[10] = a.i[i10]; b.i[11] = a.i[i11]; b.i[12] = a.i[i12]; @@ -420,3169 +318,3284 @@ namespace v16 b.i[15] = a.i[i15]; return b; - } +} -# define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v16 &a, v16 &b ) - { +inline void swap( v16& a, v16& b ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - sw( a.i[j], b.i[j] ); - } - - inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, - v16 &a04, v16 &a05, v16 &a06, v16 &a07, - v16 &a08, v16 &a09, v16 &a10, v16 &a11, - v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) - { - sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0] ); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); - sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); - sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( 
a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); - sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); - sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); - sw( a05.i[6],a06.i[5] ); sw( a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); - sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); - sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); - sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); - sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); - sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( 
a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); - sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); - sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); - sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); - sw( a14.i[15],a15.i[14] ); - } - -# undef sw - - // v16 memory manipulation functions - - inline void load_16x1( const void * ALIGNED(64) p, - v16 &a ) - { + for ( int j = 0; j < 16; j++ ) + sw( a.i[j], b.i[j] ); +} + +inline void transpose( v16& a00, v16& a01, v16& a02, v16& a03, v16& a04, + v16& a05, v16& a06, v16& a07, v16& a08, v16& a09, + v16& a10, v16& a11, v16& a12, v16& a13, v16& a14, + v16& a15 ) +{ + sw( a00.i[1], a01.i[0] ); + sw( a00.i[2], a02.i[0] ); + sw( a00.i[3], a03.i[0] ); + sw( a00.i[4], a04.i[0] ); + sw( a00.i[5], a05.i[0] ); + sw( a00.i[6], a06.i[0] ); + sw( a00.i[7], a07.i[0] ); + sw( a00.i[8], a08.i[0] ); + sw( a00.i[9], a09.i[0] ); + sw( a00.i[10], a10.i[0] ); + sw( a00.i[11], a11.i[0] ); + sw( a00.i[12], a12.i[0] ); + sw( a00.i[13], a13.i[0] ); + sw( a00.i[14], a14.i[0] ); + sw( a00.i[15], a15.i[0] ); + sw( a01.i[2], a02.i[1] ); + sw( a01.i[3], a03.i[1] ); + sw( a01.i[4], a04.i[1] ); + sw( a01.i[5], a05.i[1] ); + sw( a01.i[6], a06.i[1] ); + sw( a01.i[7], a07.i[1] ); + sw( a01.i[8], a08.i[1] ); + sw( a01.i[9], a09.i[1] ); + sw( a01.i[10], a10.i[1] ); + sw( a01.i[11], a11.i[1] ); + sw( a01.i[12], a12.i[1] ); + sw( a01.i[13], a13.i[1] ); + sw( a01.i[14], a14.i[1] ); + sw( a01.i[15], a15.i[1] ); + sw( a02.i[3], a03.i[2] ); + sw( a02.i[4], a04.i[2] ); + sw( a02.i[5], a05.i[2] ); + sw( a02.i[6], a06.i[2] ); + sw( a02.i[7], a07.i[2] ); + sw( a02.i[8], a08.i[2] ); + sw( a02.i[9], a09.i[2] ); + sw( a02.i[10], a10.i[2] ); + sw( a02.i[11], a11.i[2] ); + sw( a02.i[12], a12.i[2] ); + sw( a02.i[13], a13.i[2] ); + sw( a02.i[14], a14.i[2] ); + sw( a02.i[15], a15.i[2] ); + sw( a03.i[4], a04.i[3] ); + sw( a03.i[5], a05.i[3] ); + sw( a03.i[6], a06.i[3] ); + sw( 
a03.i[7], a07.i[3] ); + sw( a03.i[8], a08.i[3] ); + sw( a03.i[9], a09.i[3] ); + sw( a03.i[10], a10.i[3] ); + sw( a03.i[11], a11.i[3] ); + sw( a03.i[12], a12.i[3] ); + sw( a03.i[13], a13.i[3] ); + sw( a03.i[14], a14.i[3] ); + sw( a03.i[15], a15.i[3] ); + sw( a04.i[5], a05.i[4] ); + sw( a04.i[6], a06.i[4] ); + sw( a04.i[7], a07.i[4] ); + sw( a04.i[8], a08.i[4] ); + sw( a04.i[9], a09.i[4] ); + sw( a04.i[10], a10.i[4] ); + sw( a04.i[11], a11.i[4] ); + sw( a04.i[12], a12.i[4] ); + sw( a04.i[13], a13.i[4] ); + sw( a04.i[14], a14.i[4] ); + sw( a04.i[15], a15.i[4] ); + sw( a05.i[6], a06.i[5] ); + sw( a05.i[7], a07.i[5] ); + sw( a05.i[8], a08.i[5] ); + sw( a05.i[9], a09.i[5] ); + sw( a05.i[10], a10.i[5] ); + sw( a05.i[11], a11.i[5] ); + sw( a05.i[12], a12.i[5] ); + sw( a05.i[13], a13.i[5] ); + sw( a05.i[14], a14.i[5] ); + sw( a05.i[15], a15.i[5] ); + sw( a06.i[7], a07.i[6] ); + sw( a06.i[8], a08.i[6] ); + sw( a06.i[9], a09.i[6] ); + sw( a06.i[10], a10.i[6] ); + sw( a06.i[11], a11.i[6] ); + sw( a06.i[12], a12.i[6] ); + sw( a06.i[13], a13.i[6] ); + sw( a06.i[14], a14.i[6] ); + sw( a06.i[15], a15.i[6] ); + sw( a07.i[8], a08.i[7] ); + sw( a07.i[9], a09.i[7] ); + sw( a07.i[10], a10.i[7] ); + sw( a07.i[11], a11.i[7] ); + sw( a07.i[12], a12.i[7] ); + sw( a07.i[13], a13.i[7] ); + sw( a07.i[14], a14.i[7] ); + sw( a07.i[15], a15.i[7] ); + sw( a08.i[9], a09.i[8] ); + sw( a08.i[10], a10.i[8] ); + sw( a08.i[11], a11.i[8] ); + sw( a08.i[12], a12.i[8] ); + sw( a08.i[13], a13.i[8] ); + sw( a08.i[14], a14.i[8] ); + sw( a08.i[15], a15.i[8] ); + sw( a09.i[10], a10.i[9] ); + sw( a09.i[11], a11.i[9] ); + sw( a09.i[12], a12.i[9] ); + sw( a09.i[13], a13.i[9] ); + sw( a09.i[14], a14.i[9] ); + sw( a09.i[15], a15.i[9] ); + sw( a10.i[11], a11.i[10] ); + sw( a10.i[12], a12.i[10] ); + sw( a10.i[13], a13.i[10] ); + sw( a10.i[14], a14.i[10] ); + sw( a10.i[15], a15.i[10] ); + sw( a11.i[12], a12.i[11] ); + sw( a11.i[13], a13.i[11] ); + sw( a11.i[14], a14.i[11] ); + sw( a11.i[15], a15.i[11] ); + sw( 
a12.i[13], a13.i[12] ); + sw( a12.i[14], a14.i[12] ); + sw( a12.i[15], a15.i[12] ); + sw( a13.i[14], a14.i[13] ); + sw( a13.i[15], a15.i[13] ); + sw( a14.i[15], a15.i[14] ); +} + +#undef sw + +// v16 memory manipulation functions + +inline void load_16x1( const void* ALIGNED( 64 ) p, v16& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - a.i[j] = ((const int * ALIGNED(64))p)[j]; - } + for ( int j = 0; j < 16; j++ ) + a.i[j] = ( (const int* ALIGNED( 64 ))p )[j]; +} - inline void store_16x1( const v16 &a, - void * ALIGNED(64) p ) - { +inline void store_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; - } + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = a.i[j]; +} - inline void stream_16x1( const v16 &a, - void * ALIGNED(64) p ) - { +inline void stream_16x1( const v16& a, void* ALIGNED( 64 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; - } + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = a.i[j]; +} - inline void clear_16x1( void * ALIGNED(64) p ) - { +inline void clear_16x1( void* ALIGNED( 64 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = 0; - } - - // FIXME: Ordering semantics - inline void copy_16x1( void * ALIGNED(64) dst, - const void * ALIGNED(64) src ) - { + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))p )[j] = 0; +} + +// FIXME: Ordering semantics +inline void copy_16x1( void* ALIGNED( 64 ) dst, const void* ALIGNED( 64 ) src ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))dst)[j] = ((const int * ALIGNED(64))src)[j]; - } + for ( int j = 0; j < 16; j++ ) + ( (int* ALIGNED( 64 ))dst )[j] = ( (const int* ALIGNED( 64 ))src )[j]; +} - inline void swap_16x1( void * ALIGNED(64) a, - void * ALIGNED(64) b ) - { +inline void swap_16x1( void* ALIGNED( 64 ) a, void* ALIGNED( 64 ) b ) +{ int t; ALWAYS_VECTORIZE - for( int j 
= 0; j < 16; j++ ) + for ( int j = 0; j < 16; j++ ) { - t = ((int * ALIGNED(64))a)[j]; - ((int * ALIGNED(64))a)[j] = ((int * ALIGNED(64))b)[j]; - ((int * ALIGNED(64))b)[j] = t; + t = ( (int* ALIGNED( 64 ))a )[j]; + ( (int* ALIGNED( 64 ))a )[j] = ( (int* ALIGNED( 64 ))b )[j]; + ( (int* ALIGNED( 64 ))b )[j] = t; } - } - - // v16 transposed memory manipulation functions - - inline void load_16x1_tr( const void *a00, const void *a01, - const void *a02, const void *a03, - const void *a04, const void *a05, - const void *a06, const void *a07, - const void *a08, const void *a09, - const void *a10, const void *a11, - const void *a12, const void *a13, - const void *a14, const void *a15, - v16 &a ) - { - a.i[ 0] = ((const int *)a00)[0]; - a.i[ 1] = ((const int *)a01)[0]; - a.i[ 2] = ((const int *)a02)[0]; - a.i[ 3] = ((const int *)a03)[0]; - a.i[ 4] = ((const int *)a04)[0]; - a.i[ 5] = ((const int *)a05)[0]; - a.i[ 6] = ((const int *)a06)[0]; - a.i[ 7] = ((const int *)a07)[0]; - a.i[ 8] = ((const int *)a08)[0]; - a.i[ 9] = ((const int *)a09)[0]; - a.i[10] = ((const int *)a10)[0]; - a.i[11] = ((const int *)a11)[0]; - a.i[12] = ((const int *)a12)[0]; - a.i[13] = ((const int *)a13)[0]; - a.i[14] = ((const int *)a14)[0]; - a.i[15] = ((const int *)a15)[0]; - } - - inline void load_16x2_tr( const void * ALIGNED(8) a00, - const void * ALIGNED(8) a01, - const void * ALIGNED(8) a02, - const void * ALIGNED(8) a03, - const void * ALIGNED(8) a04, - const void * ALIGNED(8) a05, - const void * ALIGNED(8) a06, - const void * ALIGNED(8) a07, - const void * ALIGNED(8) a08, - const void * ALIGNED(8) a09, - const void * ALIGNED(8) a10, - const void * ALIGNED(8) a11, - const void * ALIGNED(8) a12, - const void * ALIGNED(8) a13, - const void * ALIGNED(8) a14, - const void * ALIGNED(8) a15, - v16 &a, v16 &b ) - { - a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; - - a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; - 
- a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; - - a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; - - a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; - - a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; - - a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; - - a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; - - a.i[ 8] = ((const int * ALIGNED(8))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; - - a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; - - a.i[10] = ((const int * ALIGNED(8))a10)[0]; - b.i[10] = ((const int * ALIGNED(8))a10)[1]; - - a.i[11] = ((const int * ALIGNED(8))a11)[0]; - b.i[11] = ((const int * ALIGNED(8))a11)[1]; - - a.i[12] = ((const int * ALIGNED(8))a12)[0]; - b.i[12] = ((const int * ALIGNED(8))a12)[1]; - - a.i[13] = ((const int * ALIGNED(8))a13)[0]; - b.i[13] = ((const int * ALIGNED(8))a13)[1]; - - a.i[14] = ((const int * ALIGNED(8))a14)[0]; - b.i[14] = ((const int * ALIGNED(8))a14)[1]; - - a.i[15] = ((const int * ALIGNED(8))a15)[0]; - b.i[15] = ((const int * ALIGNED(8))a15)[1]; - } - - inline void load_16x3_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = 
((const int * ALIGNED(64))a00)[2]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] 
= ((const int * ALIGNED(64))a14)[2]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - } - - inline void load_16x4_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - 
c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - } - - inline void load_16x8_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - 
const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &a, v16 &b, v16 &c, v16 &d, - v16 &e, v16 &f, v16 &g, v16 &h ) - { - a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; - b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; - c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; - d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; - e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; - f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; - g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; - h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; - - a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; - b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; - c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; - d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; - e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; - f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; - g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; - h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; - - a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; - b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; - c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; - d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; - e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; - f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; - g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; - h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; - - a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; - b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; - c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; - d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; - e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; - f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; - g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; - h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; - - a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; - b.i[ 4] = 
((const int * ALIGNED(64))a04)[1]; - c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; - d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; - e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; - f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; - g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; - h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; - - a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; - b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; - c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; - d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; - e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; - f.i[ 5] = ((const int * ALIGNED(64))a05)[5]; - g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; - h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; - - a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; - b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; - c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; - d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; - e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; - f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; - g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; - h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; - - a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; - b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; - c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; - d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; - e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; - f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; - g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; - h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; - - a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; - b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; - c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; - d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; - e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; - f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; - g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; - h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; - - a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; - b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; - c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; - d.i[ 9] = ((const int * 
ALIGNED(64))a09)[3]; - e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; - f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; - g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; - h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; - - a.i[10] = ((const int * ALIGNED(64))a10)[0]; - b.i[10] = ((const int * ALIGNED(64))a10)[1]; - c.i[10] = ((const int * ALIGNED(64))a10)[2]; - d.i[10] = ((const int * ALIGNED(64))a10)[3]; - e.i[10] = ((const int * ALIGNED(64))a10)[4]; - f.i[10] = ((const int * ALIGNED(64))a10)[5]; - g.i[10] = ((const int * ALIGNED(64))a10)[6]; - h.i[10] = ((const int * ALIGNED(64))a10)[7]; - - a.i[11] = ((const int * ALIGNED(64))a11)[0]; - b.i[11] = ((const int * ALIGNED(64))a11)[1]; - c.i[11] = ((const int * ALIGNED(64))a11)[2]; - d.i[11] = ((const int * ALIGNED(64))a11)[3]; - e.i[11] = ((const int * ALIGNED(64))a11)[4]; - f.i[11] = ((const int * ALIGNED(64))a11)[5]; - g.i[11] = ((const int * ALIGNED(64))a11)[6]; - h.i[11] = ((const int * ALIGNED(64))a11)[7]; - - a.i[12] = ((const int * ALIGNED(64))a12)[0]; - b.i[12] = ((const int * ALIGNED(64))a12)[1]; - c.i[12] = ((const int * ALIGNED(64))a12)[2]; - d.i[12] = ((const int * ALIGNED(64))a12)[3]; - e.i[12] = ((const int * ALIGNED(64))a12)[4]; - f.i[12] = ((const int * ALIGNED(64))a12)[5]; - g.i[12] = ((const int * ALIGNED(64))a12)[6]; - h.i[12] = ((const int * ALIGNED(64))a12)[7]; - - a.i[13] = ((const int * ALIGNED(64))a13)[0]; - b.i[13] = ((const int * ALIGNED(64))a13)[1]; - c.i[13] = ((const int * ALIGNED(64))a13)[2]; - d.i[13] = ((const int * ALIGNED(64))a13)[3]; - e.i[13] = ((const int * ALIGNED(64))a13)[4]; - f.i[13] = ((const int * ALIGNED(64))a13)[5]; - g.i[13] = ((const int * ALIGNED(64))a13)[6]; - h.i[13] = ((const int * ALIGNED(64))a13)[7]; - - a.i[14] = ((const int * ALIGNED(64))a14)[0]; - b.i[14] = ((const int * ALIGNED(64))a14)[1]; - c.i[14] = ((const int * ALIGNED(64))a14)[2]; - d.i[14] = ((const int * ALIGNED(64))a14)[3]; - e.i[14] = ((const int * ALIGNED(64))a14)[4]; - f.i[14] = ((const int * 
ALIGNED(64))a14)[5]; - g.i[14] = ((const int * ALIGNED(64))a14)[6]; - h.i[14] = ((const int * ALIGNED(64))a14)[7]; - - a.i[15] = ((const int * ALIGNED(64))a15)[0]; - b.i[15] = ((const int * ALIGNED(64))a15)[1]; - c.i[15] = ((const int * ALIGNED(64))a15)[2]; - d.i[15] = ((const int * ALIGNED(64))a15)[3]; - e.i[15] = ((const int * ALIGNED(64))a15)[4]; - f.i[15] = ((const int * ALIGNED(64))a15)[5]; - g.i[15] = ((const int * ALIGNED(64))a15)[6]; - h.i[15] = ((const int * ALIGNED(64))a15)[7]; - } - - inline void load_16x16_tr( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; - b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; - b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; - b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; - b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; - b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; - b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; - b15.i[ 0] = ((const int * 
ALIGNED(64))a00)[15]; - - b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; - b09.i[ 1] = ((const int * ALIGNED(64))a01)[ 9]; - b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; - b11.i[ 1] = ((const int * ALIGNED(64))a01)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a01)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; - b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; - b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; - b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; - b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; - b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; - b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; - b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; - b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 3] = ((const 
int * ALIGNED(64))a03)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; - b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; - b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; - b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; - b11.i[ 4] = ((const int * ALIGNED(64))a04)[11]; - b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; - b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; - b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; - b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; - b15.i[ 5] = 
((const int * ALIGNED(64))a05)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; - b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 8]; - b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; - b10.i[ 6] = ((const int * ALIGNED(64))a06)[10]; - b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; - b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; - b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; - b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; - b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; - 
b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 8] = ((const int * ALIGNED(64))a08)[10]; - b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; - - b00.i[ 9] = ((const int * ALIGNED(64))a09)[ 0]; - b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; - b02.i[ 9] = ((const int * ALIGNED(64))a09)[ 2]; - b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; - b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; - b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; - b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; - b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[10] = ((const int * ALIGNED(64))a10)[10]; - b11.i[10] = ((const int * ALIGNED(64))a10)[11]; - b12.i[10] = ((const int * ALIGNED(64))a10)[12]; - b13.i[10] = ((const int * ALIGNED(64))a10)[13]; - b14.i[10] = ((const int * 
ALIGNED(64))a10)[14]; - b15.i[10] = ((const int * ALIGNED(64))a10)[15]; - - b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; - b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; - b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; - b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; - b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; - b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; - b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; - b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[11] = ((const int * ALIGNED(64))a11)[ 8]; - b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[11] = ((const int * ALIGNED(64))a11)[10]; - b11.i[11] = ((const int * ALIGNED(64))a11)[11]; - b12.i[11] = ((const int * ALIGNED(64))a11)[12]; - b13.i[11] = ((const int * ALIGNED(64))a11)[13]; - b14.i[11] = ((const int * ALIGNED(64))a11)[14]; - b15.i[11] = ((const int * ALIGNED(64))a11)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[12] = ((const int * ALIGNED(64))a12)[10]; - b11.i[12] = ((const int * ALIGNED(64))a12)[11]; - b12.i[12] = ((const int * ALIGNED(64))a12)[12]; - b13.i[12] = ((const int * ALIGNED(64))a12)[13]; - b14.i[12] = ((const int * ALIGNED(64))a12)[14]; - b15.i[12] = ((const int * ALIGNED(64))a12)[15]; - - b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; - b01.i[13] = ((const int * ALIGNED(64))a13)[ 1]; - b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; - b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; - b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; - b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; - b06.i[13] = ((const 
int * ALIGNED(64))a13)[ 6]; - b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a13)[10]; - b11.i[13] = ((const int * ALIGNED(64))a13)[11]; - b12.i[13] = ((const int * ALIGNED(64))a13)[12]; - b13.i[13] = ((const int * ALIGNED(64))a13)[13]; - b14.i[13] = ((const int * ALIGNED(64))a13)[14]; - b15.i[13] = ((const int * ALIGNED(64))a13)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a14)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[14] = ((const int * ALIGNED(64))a14)[10]; - b11.i[14] = ((const int * ALIGNED(64))a14)[11]; - b12.i[14] = ((const int * ALIGNED(64))a14)[12]; - b13.i[14] = ((const int * ALIGNED(64))a14)[13]; - b14.i[14] = ((const int * ALIGNED(64))a14)[14]; - b15.i[14] = ((const int * ALIGNED(64))a14)[15]; - - b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; - b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; - b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; - b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; - b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; - b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; - b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; - b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = 
((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void load_16x8_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * 
ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const 
int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = 
((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - } - - inline void load_16x16_tr_p( const void * ALIGNED(64) a00, - const void * ALIGNED(64) a01, - const void * ALIGNED(64) a02, - const void * ALIGNED(64) a03, - const void * ALIGNED(64) a04, - const void * ALIGNED(64) a05, - const void * ALIGNED(64) a06, - const void * ALIGNED(64) a07, - const void * ALIGNED(64) a08, - const void * ALIGNED(64) a09, - const void * ALIGNED(64) a10, - const void * ALIGNED(64) a11, - const void * ALIGNED(64) a12, - const void * ALIGNED(64) a13, - const void * ALIGNED(64) a14, - const void * ALIGNED(64) a15, - v16 &b00, v16 &b01, v16 &b02, v16 &b03, - v16 &b04, v16 &b05, v16 &b06, v16 &b07, - v16 &b08, v16 &b09, v16 &b10, v16 &b11, - v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) - { - b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; - b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; - b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; - b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; - b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; - b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; - b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; - b07.i[ 0] = ((const int * 
ALIGNED(64))a00)[ 7]; - b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; - b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; - b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; - b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; - b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; - b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; - b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; - b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; - - b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; - b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; - b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; - b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; - b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; - b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; - b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; - b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; - b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; - b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; - b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; - b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; - b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; - b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; - b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; - b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; - - b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; - b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; - b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; - b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; - b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; - b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; - b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; - b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; - b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; - b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; - b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; - b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; - b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; - b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; - b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; - b07.i[ 5] = ((const 
int * ALIGNED(64))a02)[15]; - - b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; - b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; - b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; - b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; - b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; - b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; - b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; - b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; - b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; - b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; - b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; - b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; - b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; - b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; - b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; - b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; - - b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; - b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; - b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; - b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; - b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; - b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; - b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; - b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; - b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; - b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; - b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; - b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; - b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; - b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; - b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; - b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; - - b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; - b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; - b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; - b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; - b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; - b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; - b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; - b07.i[10] = 
((const int * ALIGNED(64))a05)[ 7]; - b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; - b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; - b02.i[11] = ((const int * ALIGNED(64))a05)[10]; - b03.i[11] = ((const int * ALIGNED(64))a05)[11]; - b04.i[11] = ((const int * ALIGNED(64))a05)[12]; - b05.i[11] = ((const int * ALIGNED(64))a05)[13]; - b06.i[11] = ((const int * ALIGNED(64))a05)[14]; - b07.i[11] = ((const int * ALIGNED(64))a05)[15]; - - b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; - b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; - b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; - b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; - b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; - b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; - b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; - b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; - b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; - b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; - b02.i[13] = ((const int * ALIGNED(64))a06)[10]; - b03.i[13] = ((const int * ALIGNED(64))a06)[11]; - b04.i[13] = ((const int * ALIGNED(64))a06)[12]; - b05.i[13] = ((const int * ALIGNED(64))a06)[13]; - b06.i[13] = ((const int * ALIGNED(64))a06)[14]; - b07.i[13] = ((const int * ALIGNED(64))a06)[15]; - - b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; - b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; - b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; - b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; - b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; - b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; - b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; - b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; - b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; - b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; - b02.i[15] = ((const int * ALIGNED(64))a07)[10]; - b03.i[15] = ((const int * ALIGNED(64))a07)[11]; - b04.i[15] = ((const int * ALIGNED(64))a07)[12]; - b05.i[15] = ((const int * ALIGNED(64))a07)[13]; - b06.i[15] = ((const int * ALIGNED(64))a07)[14]; - 
b07.i[15] = ((const int * ALIGNED(64))a07)[15]; - - b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; - b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; - b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; - b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; - b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; - b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; - b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; - b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; - b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; - b09.i[ 1] = ((const int * ALIGNED(64))a08)[ 9]; - b10.i[ 1] = ((const int * ALIGNED(64))a08)[10]; - b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; - b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; - b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; - b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; - b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; - - b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; - b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; - b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; - b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; - b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; - b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; - b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; - b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; - b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; - b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; - b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; - b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; - b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; - b13.i[ 3] = ((const int * ALIGNED(64))a09)[13]; - b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; - b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; - - b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; - b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; - b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; - b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; - b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; - b13.i[ 4] = ((const int * ALIGNED(64))a10)[ 5]; - b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 
6]; - b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; - b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; - b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; - b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; - b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; - b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; - b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; - b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; - b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; - - b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; - b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; - b10.i[ 6] = ((const int * ALIGNED(64))a11)[ 2]; - b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; - b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; - b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; - b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; - b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; - b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; - b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; - b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; - b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; - b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; - b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; - b14.i[ 7] = ((const int * ALIGNED(64))a11)[14]; - b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; - - b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; - b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; - b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; - b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; - b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; - b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; - b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; - b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; - b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; - b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; - b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; - b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; - b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; - b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; - b14.i[ 9] = ((const int * 
ALIGNED(64))a12)[14]; - b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; - - b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; - b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; - b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; - b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; - b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; - b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; - b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; - b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; - b08.i[11] = ((const int * ALIGNED(64))a13)[ 8]; - b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; - b10.i[11] = ((const int * ALIGNED(64))a13)[10]; - b11.i[11] = ((const int * ALIGNED(64))a13)[11]; - b12.i[11] = ((const int * ALIGNED(64))a13)[12]; - b13.i[11] = ((const int * ALIGNED(64))a13)[13]; - b14.i[11] = ((const int * ALIGNED(64))a13)[14]; - b15.i[11] = ((const int * ALIGNED(64))a13)[15]; - - b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; - b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; - b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; - b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; - b12.i[12] = ((const int * ALIGNED(64))a14)[ 4]; - b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; - b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; - b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; - b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; - b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; - b10.i[13] = ((const int * ALIGNED(64))a14)[10]; - b11.i[13] = ((const int * ALIGNED(64))a14)[11]; - b12.i[13] = ((const int * ALIGNED(64))a14)[12]; - b13.i[13] = ((const int * ALIGNED(64))a14)[13]; - b14.i[13] = ((const int * ALIGNED(64))a14)[14]; - b15.i[13] = ((const int * ALIGNED(64))a14)[15]; - - b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; - b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; - b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; - b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; - b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; - b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; - b14.i[14] = ((const 
int * ALIGNED(64))a15)[ 6]; - b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; - b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; - b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; - b10.i[15] = ((const int * ALIGNED(64))a15)[10]; - b11.i[15] = ((const int * ALIGNED(64))a15)[11]; - b12.i[15] = ((const int * ALIGNED(64))a15)[12]; - b13.i[15] = ((const int * ALIGNED(64))a15)[13]; - b14.i[15] = ((const int * ALIGNED(64))a15)[14]; - b15.i[15] = ((const int * ALIGNED(64))a15)[15]; - } - - inline void store_16x1_tr( const v16 &a, - void *a00, void *a01, void *a02, void *a03, - void *a04, void *a05, void *a06, void *a07, - void *a08, void *a09, void *a10, void *a11, - void *a12, void *a13, void *a14, void *a15 ) - { - ((int *)a00)[0] = a.i[ 0]; - ((int *)a01)[0] = a.i[ 1]; - ((int *)a02)[0] = a.i[ 2]; - ((int *)a03)[0] = a.i[ 3]; - ((int *)a04)[0] = a.i[ 4]; - ((int *)a05)[0] = a.i[ 5]; - ((int *)a06)[0] = a.i[ 6]; - ((int *)a07)[0] = a.i[ 7]; - ((int *)a08)[0] = a.i[ 8]; - ((int *)a09)[0] = a.i[ 9]; - ((int *)a10)[0] = a.i[10]; - ((int *)a11)[0] = a.i[11]; - ((int *)a12)[0] = a.i[12]; - ((int *)a13)[0] = a.i[13]; - ((int *)a14)[0] = a.i[14]; - ((int *)a15)[0] = a.i[15]; - } - - inline void store_16x2_tr( const v16 &a, const v16 &b, - void * ALIGNED(8) a00, void * ALIGNED(8) a01, - void * ALIGNED(8) a02, void * ALIGNED(8) a03, - void * ALIGNED(8) a04, void * ALIGNED(8) a05, - void * ALIGNED(8) a06, void * ALIGNED(8) a07, - void * ALIGNED(8) a08, void * ALIGNED(8) a09, - void * ALIGNED(8) a10, void * ALIGNED(8) a11, - void * ALIGNED(8) a12, void * ALIGNED(8) a13, - void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) - { - ((int * ALIGNED(8))a00)[0] = a.i[ 0]; - ((int * ALIGNED(8))a00)[1] = b.i[ 0]; - - ((int * ALIGNED(8))a01)[0] = a.i[ 1]; - ((int * ALIGNED(8))a01)[1] = b.i[ 1]; - - ((int * ALIGNED(8))a02)[0] = a.i[ 2]; - ((int * ALIGNED(8))a02)[1] = b.i[ 2]; - - ((int * ALIGNED(8))a03)[0] = a.i[ 3]; - ((int * ALIGNED(8))a03)[1] = b.i[ 3]; - - ((int * ALIGNED(8))a04)[0] = a.i[ 4]; 
- ((int * ALIGNED(8))a04)[1] = b.i[ 4]; - - ((int * ALIGNED(8))a05)[0] = a.i[ 5]; - ((int * ALIGNED(8))a05)[1] = b.i[ 5]; - - ((int * ALIGNED(8))a06)[0] = a.i[ 6]; - ((int * ALIGNED(8))a06)[1] = b.i[ 6]; - - ((int * ALIGNED(8))a07)[0] = a.i[ 7]; - ((int * ALIGNED(8))a07)[1] = b.i[ 7]; - - ((int * ALIGNED(8))a08)[0] = a.i[ 8]; - ((int * ALIGNED(8))a08)[1] = b.i[ 8]; - - ((int * ALIGNED(8))a09)[0] = a.i[ 9]; - ((int * ALIGNED(8))a09)[1] = b.i[ 9]; - - ((int * ALIGNED(8))a10)[0] = a.i[10]; - ((int * ALIGNED(8))a10)[1] = b.i[10]; - - ((int * ALIGNED(8))a11)[0] = a.i[11]; - ((int * ALIGNED(8))a11)[1] = b.i[11]; - - ((int * ALIGNED(8))a12)[0] = a.i[12]; - ((int * ALIGNED(8))a12)[1] = b.i[12]; - - ((int * ALIGNED(8))a13)[0] = a.i[13]; - ((int * ALIGNED(8))a13)[1] = b.i[13]; - - ((int * ALIGNED(8))a14)[0] = a.i[14]; - ((int * ALIGNED(8))a14)[1] = b.i[14]; - - ((int * ALIGNED(8))a15)[0] = a.i[15]; - ((int * ALIGNED(8))a15)[1] = b.i[15]; - } - - inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = 
b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - } - - inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 
0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - 
((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - } - - inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, - const v16 &e, const v16 &f, const v16 &g, const v16 &h, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[0] = a.i[ 0]; - ((int * ALIGNED(64))a00)[1] = b.i[ 0]; - ((int * ALIGNED(64))a00)[2] = c.i[ 0]; - ((int * ALIGNED(64))a00)[3] = d.i[ 0]; - ((int * ALIGNED(64))a00)[4] = e.i[ 0]; - ((int * ALIGNED(64))a00)[5] = f.i[ 0]; - ((int * ALIGNED(64))a00)[6] = g.i[ 0]; - ((int * ALIGNED(64))a00)[7] = h.i[ 0]; - - ((int * ALIGNED(64))a01)[0] = a.i[ 1]; - ((int * ALIGNED(64))a01)[1] = b.i[ 1]; - ((int * ALIGNED(64))a01)[2] = c.i[ 1]; - ((int * ALIGNED(64))a01)[3] = d.i[ 1]; - ((int * ALIGNED(64))a01)[4] = e.i[ 1]; - ((int * ALIGNED(64))a01)[5] = f.i[ 1]; - ((int * ALIGNED(64))a01)[6] = g.i[ 1]; - ((int * ALIGNED(64))a01)[7] = h.i[ 1]; - - ((int * ALIGNED(64))a02)[0] = a.i[ 2]; - ((int * ALIGNED(64))a02)[1] = b.i[ 2]; - ((int * ALIGNED(64))a02)[2] = c.i[ 2]; - ((int * ALIGNED(64))a02)[3] = d.i[ 2]; - ((int * ALIGNED(64))a02)[4] = e.i[ 2]; - 
((int * ALIGNED(64))a02)[5] = f.i[ 2]; - ((int * ALIGNED(64))a02)[6] = g.i[ 2]; - ((int * ALIGNED(64))a02)[7] = h.i[ 2]; - - ((int * ALIGNED(64))a03)[0] = a.i[ 3]; - ((int * ALIGNED(64))a03)[1] = b.i[ 3]; - ((int * ALIGNED(64))a03)[2] = c.i[ 3]; - ((int * ALIGNED(64))a03)[3] = d.i[ 3]; - ((int * ALIGNED(64))a03)[4] = e.i[ 3]; - ((int * ALIGNED(64))a03)[5] = f.i[ 3]; - ((int * ALIGNED(64))a03)[6] = g.i[ 3]; - ((int * ALIGNED(64))a03)[7] = h.i[ 3]; - - ((int * ALIGNED(64))a04)[0] = a.i[ 4]; - ((int * ALIGNED(64))a04)[1] = b.i[ 4]; - ((int * ALIGNED(64))a04)[2] = c.i[ 4]; - ((int * ALIGNED(64))a04)[3] = d.i[ 4]; - ((int * ALIGNED(64))a04)[4] = e.i[ 4]; - ((int * ALIGNED(64))a04)[5] = f.i[ 4]; - ((int * ALIGNED(64))a04)[6] = g.i[ 4]; - ((int * ALIGNED(64))a04)[7] = h.i[ 4]; - - ((int * ALIGNED(64))a05)[0] = a.i[ 5]; - ((int * ALIGNED(64))a05)[1] = b.i[ 5]; - ((int * ALIGNED(64))a05)[2] = c.i[ 5]; - ((int * ALIGNED(64))a05)[3] = d.i[ 5]; - ((int * ALIGNED(64))a05)[4] = e.i[ 5]; - ((int * ALIGNED(64))a05)[5] = f.i[ 5]; - ((int * ALIGNED(64))a05)[6] = g.i[ 5]; - ((int * ALIGNED(64))a05)[7] = h.i[ 5]; - - ((int * ALIGNED(64))a06)[0] = a.i[ 6]; - ((int * ALIGNED(64))a06)[1] = b.i[ 6]; - ((int * ALIGNED(64))a06)[2] = c.i[ 6]; - ((int * ALIGNED(64))a06)[3] = d.i[ 6]; - ((int * ALIGNED(64))a06)[4] = e.i[ 6]; - ((int * ALIGNED(64))a06)[5] = f.i[ 6]; - ((int * ALIGNED(64))a06)[6] = g.i[ 6]; - ((int * ALIGNED(64))a06)[7] = h.i[ 6]; - - ((int * ALIGNED(64))a07)[0] = a.i[ 7]; - ((int * ALIGNED(64))a07)[1] = b.i[ 7]; - ((int * ALIGNED(64))a07)[2] = c.i[ 7]; - ((int * ALIGNED(64))a07)[3] = d.i[ 7]; - ((int * ALIGNED(64))a07)[4] = e.i[ 7]; - ((int * ALIGNED(64))a07)[5] = f.i[ 7]; - ((int * ALIGNED(64))a07)[6] = g.i[ 7]; - ((int * ALIGNED(64))a07)[7] = h.i[ 7]; - - ((int * ALIGNED(64))a08)[0] = a.i[ 8]; - ((int * ALIGNED(64))a08)[1] = b.i[ 8]; - ((int * ALIGNED(64))a08)[2] = c.i[ 8]; - ((int * ALIGNED(64))a08)[3] = d.i[ 8]; - ((int * ALIGNED(64))a08)[4] = e.i[ 8]; - ((int * 
ALIGNED(64))a08)[5] = f.i[ 8]; - ((int * ALIGNED(64))a08)[6] = g.i[ 8]; - ((int * ALIGNED(64))a08)[7] = h.i[ 8]; - - ((int * ALIGNED(64))a09)[0] = a.i[ 9]; - ((int * ALIGNED(64))a09)[1] = b.i[ 9]; - ((int * ALIGNED(64))a09)[2] = c.i[ 9]; - ((int * ALIGNED(64))a09)[3] = d.i[ 9]; - ((int * ALIGNED(64))a09)[4] = e.i[ 9]; - ((int * ALIGNED(64))a09)[5] = f.i[ 9]; - ((int * ALIGNED(64))a09)[6] = g.i[ 9]; - ((int * ALIGNED(64))a09)[7] = h.i[ 9]; - - ((int * ALIGNED(64))a10)[0] = a.i[10]; - ((int * ALIGNED(64))a10)[1] = b.i[10]; - ((int * ALIGNED(64))a10)[2] = c.i[10]; - ((int * ALIGNED(64))a10)[3] = d.i[10]; - ((int * ALIGNED(64))a10)[4] = e.i[10]; - ((int * ALIGNED(64))a10)[5] = f.i[10]; - ((int * ALIGNED(64))a10)[6] = g.i[10]; - ((int * ALIGNED(64))a10)[7] = h.i[10]; - - ((int * ALIGNED(64))a11)[0] = a.i[11]; - ((int * ALIGNED(64))a11)[1] = b.i[11]; - ((int * ALIGNED(64))a11)[2] = c.i[11]; - ((int * ALIGNED(64))a11)[3] = d.i[11]; - ((int * ALIGNED(64))a11)[4] = e.i[11]; - ((int * ALIGNED(64))a11)[5] = f.i[11]; - ((int * ALIGNED(64))a11)[6] = g.i[11]; - ((int * ALIGNED(64))a11)[7] = h.i[11]; - - ((int * ALIGNED(64))a12)[0] = a.i[12]; - ((int * ALIGNED(64))a12)[1] = b.i[12]; - ((int * ALIGNED(64))a12)[2] = c.i[12]; - ((int * ALIGNED(64))a12)[3] = d.i[12]; - ((int * ALIGNED(64))a12)[4] = e.i[12]; - ((int * ALIGNED(64))a12)[5] = f.i[12]; - ((int * ALIGNED(64))a12)[6] = g.i[12]; - ((int * ALIGNED(64))a12)[7] = h.i[12]; - - ((int * ALIGNED(64))a13)[0] = a.i[13]; - ((int * ALIGNED(64))a13)[1] = b.i[13]; - ((int * ALIGNED(64))a13)[2] = c.i[13]; - ((int * ALIGNED(64))a13)[3] = d.i[13]; - ((int * ALIGNED(64))a13)[4] = e.i[13]; - ((int * ALIGNED(64))a13)[5] = f.i[13]; - ((int * ALIGNED(64))a13)[6] = g.i[13]; - ((int * ALIGNED(64))a13)[7] = h.i[13]; - - ((int * ALIGNED(64))a14)[0] = a.i[14]; - ((int * ALIGNED(64))a14)[1] = b.i[14]; - ((int * ALIGNED(64))a14)[2] = c.i[14]; - ((int * ALIGNED(64))a14)[3] = d.i[14]; - ((int * ALIGNED(64))a14)[4] = e.i[14]; - ((int * ALIGNED(64))a14)[5] 
= f.i[14]; - ((int * ALIGNED(64))a14)[6] = g.i[14]; - ((int * ALIGNED(64))a14)[7] = h.i[14]; - - ((int * ALIGNED(64))a15)[0] = a.i[15]; - ((int * ALIGNED(64))a15)[1] = b.i[15]; - ((int * ALIGNED(64))a15)[2] = c.i[15]; - ((int * ALIGNED(64))a15)[3] = d.i[15]; - ((int * ALIGNED(64))a15)[4] = e.i[15]; - ((int * ALIGNED(64))a15)[5] = f.i[15]; - ((int * ALIGNED(64))a15)[6] = g.i[15]; - ((int * ALIGNED(64))a15)[7] = h.i[15]; - } - - inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; - ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; - ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; - ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; - ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; - ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; - ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; - ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; - ((int * ALIGNED(64))a01)[ 3] = 
b03.i[ 1]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; - ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; - ((int * ALIGNED(64))a01)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; - ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; - ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; - ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; - ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; - ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; - ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; - ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; - ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; - 
((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; - ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; - ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; - ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; - ((int * ALIGNED(64))a04)[12] = b12.i[ 4]; - ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; - ((int * ALIGNED(64))a04)[14] = b14.i[ 4]; - ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; - ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; - ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; - ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; - ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; - ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; - ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; - ((int * 
ALIGNED(64))a06)[14] = b14.i[ 6]; - ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; - ((int * ALIGNED(64))a07)[ 8] = b08.i[ 7]; - ((int * ALIGNED(64))a07)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; - ((int * ALIGNED(64))a07)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a08)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; - - ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; - ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; - ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; - ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; - ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; - ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; - ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; - ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; - ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; - ((int * 
ALIGNED(64))a09)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; - - ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a10)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a10)[ 8] = b08.i[10]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; - ((int * ALIGNED(64))a10)[10] = b10.i[10]; - ((int * ALIGNED(64))a10)[11] = b11.i[10]; - ((int * ALIGNED(64))a10)[12] = b12.i[10]; - ((int * ALIGNED(64))a10)[13] = b13.i[10]; - ((int * ALIGNED(64))a10)[14] = b14.i[10]; - ((int * ALIGNED(64))a10)[15] = b15.i[10]; - - ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; - ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; - ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; - ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; - ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; - ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; - ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; - ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a11)[10] = b10.i[11]; - ((int * ALIGNED(64))a11)[11] = b11.i[11]; - ((int * ALIGNED(64))a11)[12] = b12.i[11]; - ((int * ALIGNED(64))a11)[13] = b13.i[11]; - ((int * ALIGNED(64))a11)[14] = b14.i[11]; - ((int * ALIGNED(64))a11)[15] = b15.i[11]; - - ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; - ((int * 
ALIGNED(64))a12)[ 8] = b08.i[12]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; - ((int * ALIGNED(64))a12)[10] = b10.i[12]; - ((int * ALIGNED(64))a12)[11] = b11.i[12]; - ((int * ALIGNED(64))a12)[12] = b12.i[12]; - ((int * ALIGNED(64))a12)[13] = b13.i[12]; - ((int * ALIGNED(64))a12)[14] = b14.i[12]; - ((int * ALIGNED(64))a12)[15] = b15.i[12]; - - ((int * ALIGNED(64))a13)[ 0] = b00.i[13]; - ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; - ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; - ((int * ALIGNED(64))a13)[ 3] = b03.i[13]; - ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; - ((int * ALIGNED(64))a13)[ 5] = b05.i[13]; - ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; - ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a13)[10] = b10.i[13]; - ((int * ALIGNED(64))a13)[11] = b11.i[13]; - ((int * ALIGNED(64))a13)[12] = b12.i[13]; - ((int * ALIGNED(64))a13)[13] = b13.i[13]; - ((int * ALIGNED(64))a13)[14] = b14.i[13]; - ((int * ALIGNED(64))a13)[15] = b15.i[13]; - - ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; - ((int * ALIGNED(64))a14)[10] = b10.i[14]; - ((int * ALIGNED(64))a14)[11] = b11.i[14]; - ((int * ALIGNED(64))a14)[12] = b12.i[14]; - ((int * ALIGNED(64))a14)[13] = b13.i[14]; - ((int * ALIGNED(64))a14)[14] = b14.i[14]; - ((int * ALIGNED(64))a14)[15] = b15.i[14]; - - ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; - ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; - ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; - ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; - ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; - ((int * 
ALIGNED(64))a15)[ 5] = b05.i[15]; - ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; - ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - inline void store_16x8_tr_p( const v16 &b00, - const v16 &b01, - const v16 &b02, - const v16 &b03, - const v16 &b04, - const v16 &b05, - const v16 &b06, - const v16 &b07, - void * ALIGNED(64) a00, - void * ALIGNED(64) a01, - void * ALIGNED(64) a02, - void * ALIGNED(64) a03, - void * ALIGNED(64) a04, - void * ALIGNED(64) a05, - void * ALIGNED(64) a06, - void * ALIGNED(64) a07 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] 
= b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; 
- ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * 
ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * ALIGNED(64))a07)[15] = b07.i[15]; - } - - inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, - const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, - const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, - const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, - void * ALIGNED(64) a00, void * ALIGNED(64) a01, - void * ALIGNED(64) a02, void * ALIGNED(64) a03, - void * ALIGNED(64) a04, void * ALIGNED(64) a05, - void * ALIGNED(64) a06, void * ALIGNED(64) a07, - void * ALIGNED(64) a08, void * ALIGNED(64) a09, - void * ALIGNED(64) a10, void * ALIGNED(64) a11, - void * ALIGNED(64) a12, void * ALIGNED(64) a13, - void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) - { - ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; - ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; - ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; - ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; - ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; - ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; - ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; - ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; - ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; - ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; - ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; - ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; - ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; - ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; - ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; - ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; - - ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; - ((int * 
ALIGNED(64))a01)[ 1] = b01.i[ 2]; - ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; - ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; - ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; - ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; - ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; - ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; - ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; - ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; - ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; - ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; - ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; - ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; - ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; - ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; - - ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; - ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; - ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; - ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; - ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; - ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; - ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; - ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; - ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; - ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; - ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; - ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; - ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; - ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; - ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; - ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; - - ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; - ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; - ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; - ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; - ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; - ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; - ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; - ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; - ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; - ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; - ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; - ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; - ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; - ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; - ((int * 
ALIGNED(64))a03)[14] = b06.i[ 7]; - ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; - - ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; - ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; - ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; - ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; - ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; - ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; - ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; - ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; - ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; - ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; - ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; - ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; - ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; - ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; - ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; - ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; - - ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; - ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; - ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; - ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; - ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; - ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; - ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; - ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; - ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; - ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; - ((int * ALIGNED(64))a05)[10] = b02.i[11]; - ((int * ALIGNED(64))a05)[11] = b03.i[11]; - ((int * ALIGNED(64))a05)[12] = b04.i[11]; - ((int * ALIGNED(64))a05)[13] = b05.i[11]; - ((int * ALIGNED(64))a05)[14] = b06.i[11]; - ((int * ALIGNED(64))a05)[15] = b07.i[11]; - - ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; - ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; - ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; - ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; - ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; - ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; - ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; - ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; - ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; - ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; - ((int * ALIGNED(64))a06)[10] = b02.i[13]; - ((int * 
ALIGNED(64))a06)[11] = b03.i[13]; - ((int * ALIGNED(64))a06)[12] = b04.i[13]; - ((int * ALIGNED(64))a06)[13] = b05.i[13]; - ((int * ALIGNED(64))a06)[14] = b06.i[13]; - ((int * ALIGNED(64))a06)[15] = b07.i[13]; - - ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; - ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; - ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; - ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; - ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; - ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; - ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; - ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; - ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; - ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; - ((int * ALIGNED(64))a07)[10] = b02.i[15]; - ((int * ALIGNED(64))a07)[11] = b03.i[15]; - ((int * ALIGNED(64))a07)[12] = b04.i[15]; - ((int * ALIGNED(64))a07)[13] = b05.i[15]; - ((int * ALIGNED(64))a07)[14] = b06.i[15]; - ((int * ALIGNED(64))a07)[15] = b07.i[15]; - - ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; - ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; - ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; - ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; - ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; - ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; - ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; - ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; - ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; - ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; - ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; - ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; - ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; - ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; - ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; - ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; - - ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; - ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; - ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; - ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; - ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; - ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; - ((int * ALIGNED(64))a09)[ 6] = b14.i[ 2]; - ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; - ((int * 
ALIGNED(64))a09)[ 8] = b08.i[ 3]; - ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; - ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; - ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; - ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; - ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; - ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; - ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; - - ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; - ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; - ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; - ((int * ALIGNED(64))a10)[ 3] = b11.i[ 4]; - ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; - ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; - ((int * ALIGNED(64))a10)[ 6] = b14.i[ 4]; - ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; - ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; - ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; - ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; - ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; - ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; - ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; - ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; - ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; - - ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; - ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; - ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; - ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; - ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; - ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; - ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; - ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; - ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; - ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; - ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; - ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; - ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; - ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; - ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; - ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; - - ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; - ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; - ((int * ALIGNED(64))a12)[ 2] = b10.i[ 8]; - ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; - ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; - ((int * 
ALIGNED(64))a12)[ 5] = b13.i[ 8]; - ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; - ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; - ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; - ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; - ((int * ALIGNED(64))a12)[10] = b10.i[ 9]; - ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; - ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; - ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; - ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; - ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; - - ((int * ALIGNED(64))a13)[ 0] = b08.i[10]; - ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; - ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; - ((int * ALIGNED(64))a13)[ 3] = b11.i[10]; - ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; - ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; - ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; - ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; - ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; - ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; - ((int * ALIGNED(64))a13)[10] = b10.i[11]; - ((int * ALIGNED(64))a13)[11] = b11.i[11]; - ((int * ALIGNED(64))a13)[12] = b12.i[11]; - ((int * ALIGNED(64))a13)[13] = b13.i[11]; - ((int * ALIGNED(64))a13)[14] = b14.i[11]; - ((int * ALIGNED(64))a13)[15] = b15.i[11]; - - ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; - ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; - ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; - ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; - ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; - ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; - ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; - ((int * ALIGNED(64))a14)[ 7] = b15.i[12]; - ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; - ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; - ((int * ALIGNED(64))a14)[10] = b10.i[13]; - ((int * ALIGNED(64))a14)[11] = b11.i[13]; - ((int * ALIGNED(64))a14)[12] = b12.i[13]; - ((int * ALIGNED(64))a14)[13] = b13.i[13]; - ((int * ALIGNED(64))a14)[14] = b14.i[13]; - ((int * ALIGNED(64))a14)[15] = b15.i[13]; - - ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; - ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; - ((int * 
ALIGNED(64))a15)[ 2] = b10.i[14]; - ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; - ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; - ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; - ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; - ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; - ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; - ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; - ((int * ALIGNED(64))a15)[10] = b10.i[15]; - ((int * ALIGNED(64))a15)[11] = b11.i[15]; - ((int * ALIGNED(64))a15)[12] = b12.i[15]; - ((int * ALIGNED(64))a15)[13] = b13.i[15]; - ((int * ALIGNED(64))a15)[14] = b14.i[15]; - ((int * ALIGNED(64))a15)[15] = b15.i[15]; - } - - ////////////// - // v16int class - - class v16int : public v16 - { +} + +// v16 transposed memory manipulation functions + +inline void load_16x1_tr( const void* a00, const void* a01, const void* a02, + const void* a03, const void* a04, const void* a05, + const void* a06, const void* a07, const void* a08, + const void* a09, const void* a10, const void* a11, + const void* a12, const void* a13, const void* a14, + const void* a15, v16& a ) +{ + a.i[0] = ( (const int*)a00 )[0]; + a.i[1] = ( (const int*)a01 )[0]; + a.i[2] = ( (const int*)a02 )[0]; + a.i[3] = ( (const int*)a03 )[0]; + a.i[4] = ( (const int*)a04 )[0]; + a.i[5] = ( (const int*)a05 )[0]; + a.i[6] = ( (const int*)a06 )[0]; + a.i[7] = ( (const int*)a07 )[0]; + a.i[8] = ( (const int*)a08 )[0]; + a.i[9] = ( (const int*)a09 )[0]; + a.i[10] = ( (const int*)a10 )[0]; + a.i[11] = ( (const int*)a11 )[0]; + a.i[12] = ( (const int*)a12 )[0]; + a.i[13] = ( (const int*)a13 )[0]; + a.i[14] = ( (const int*)a14 )[0]; + a.i[15] = ( (const int*)a15 )[0]; +} + +inline void +load_16x2_tr( const void* ALIGNED( 8 ) a00, const void* ALIGNED( 8 ) a01, + const void* ALIGNED( 8 ) a02, const void* ALIGNED( 8 ) a03, + const void* ALIGNED( 8 ) a04, const void* ALIGNED( 8 ) a05, + const void* ALIGNED( 8 ) a06, const void* ALIGNED( 8 ) a07, + const void* ALIGNED( 8 ) a08, const void* ALIGNED( 8 ) a09, + const void* ALIGNED( 8 ) a10, 
const void* ALIGNED( 8 ) a11, + const void* ALIGNED( 8 ) a12, const void* ALIGNED( 8 ) a13, + const void* ALIGNED( 8 ) a14, const void* ALIGNED( 8 ) a15, + v16& a, v16& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a00 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a01 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a02 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a03 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a04 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a05 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a06 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a07 )[1]; + + a.i[8] = ( (const int* ALIGNED( 8 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 8 ))a08 )[1]; + + a.i[9] = ( (const int* ALIGNED( 8 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 8 ))a09 )[1]; + + a.i[10] = ( (const int* ALIGNED( 8 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 8 ))a10 )[1]; + + a.i[11] = ( (const int* ALIGNED( 8 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 8 ))a11 )[1]; + + a.i[12] = ( (const int* ALIGNED( 8 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 8 ))a12 )[1]; + + a.i[13] = ( (const int* ALIGNED( 8 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 8 ))a13 )[1]; + + a.i[14] = ( (const int* ALIGNED( 8 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 8 ))a14 )[1]; + + a.i[15] = ( (const int* ALIGNED( 8 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 8 ))a15 )[1]; +} + +inline void +load_16x3_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 
64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + + a.i[11] = ( (const int* 
ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; +} + +inline void +load_16x4_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 
)[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; 
+ c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; +} + +inline void +load_16x8_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& a, v16& b, v16& c, v16& d, v16& e, v16& f, v16& g, v16& h ) +{ + a.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + c.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + d.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + e.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + f.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + g.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + h.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + + a.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + c.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + d.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + e.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + f.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + g.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + h.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + + a.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b.i[2] = ( (const int* ALIGNED( 64 ))a02 
)[1]; + c.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + d.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + e.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + f.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + g.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + h.i[2] = ( (const int* ALIGNED( 64 ))a02 )[7]; + + a.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + c.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + d.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + e.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + f.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + g.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + h.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + + a.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + c.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + d.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + e.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + f.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + g.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + h.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + + a.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + c.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + d.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + e.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + f.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + g.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + h.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + + a.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + c.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + d.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + e.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + f.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + g.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + h.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + + a.i[7] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + c.i[7] = ( (const 
int* ALIGNED( 64 ))a07 )[2]; + d.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + e.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + f.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + g.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + h.i[7] = ( (const int* ALIGNED( 64 ))a07 )[7]; + + a.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + c.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + d.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + e.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + f.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + g.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + h.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + + a.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + c.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + d.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + e.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + f.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + g.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + h.i[9] = ( (const int* ALIGNED( 64 ))a09 )[7]; + + a.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + c.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + d.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + e.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + f.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + g.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + h.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + + a.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + c.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + d.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + e.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + f.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + g.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + h.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + + a.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + c.i[12] = ( (const int* 
ALIGNED( 64 ))a12 )[2]; + d.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + e.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + f.i[12] = ( (const int* ALIGNED( 64 ))a12 )[5]; + g.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + h.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; + + a.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + c.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + d.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + e.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + f.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + g.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + h.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + + a.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + c.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + d.i[14] = ( (const int* ALIGNED( 64 ))a14 )[3]; + e.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + f.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + g.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + h.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + + a.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + c.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + d.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + e.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + f.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + g.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + h.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; +} + +inline void +load_16x16_tr( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const 
void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b08.i[0] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b10.i[0] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b15.i[0] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[1] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b10.i[1] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a02 )[2]; + 
b03.i[2] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b08.i[2] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b11.i[2] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[3] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b08.i[4] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a04 
)[9]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b15.i[4] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[5] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b14.i[5] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b08.i[6] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b09.i[6] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[7] = ( (const int* 
ALIGNED( 64 ))a07 )[0]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a08 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a08 )[7]; + b08.i[8] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a08 )[10]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b00.i[9] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b07.i[9] = ( (const 
int* ALIGNED( 64 ))a09 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a09 )[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[9] = ( (const int* ALIGNED( 64 ))a09 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[10] = ( (const int* ALIGNED( 64 ))a10 )[8]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[10] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a10 )[14]; + b15.i[10] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b00.i[11] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[11] = ( (const int* 
ALIGNED( 64 ))a11 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a12 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a12 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[12] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a12 )[15]; + + b00.i[13] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a13 )[5]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a13 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b03.i[14] = ( (const int* 
ALIGNED( 64 ))a14 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[14] = ( (const int* ALIGNED( 64 ))a14 )[8]; + b09.i[14] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[14] = ( (const int* ALIGNED( 64 ))a14 )[11]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[14] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b00.i[15] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a15 )[5]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( (const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 ))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void +load_16x8_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* 
ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 ))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const 
int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + 
b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + 
b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; +} + +inline void +load_16x16_tr_p( const void* ALIGNED( 64 ) a00, const void* ALIGNED( 64 ) a01, + const void* ALIGNED( 64 ) a02, const void* ALIGNED( 64 ) a03, + const void* ALIGNED( 64 ) a04, const void* ALIGNED( 64 ) a05, + const void* ALIGNED( 64 ) a06, const void* ALIGNED( 64 ) a07, + const void* ALIGNED( 64 ) a08, const void* ALIGNED( 64 ) a09, + const void* ALIGNED( 64 ) a10, const void* ALIGNED( 64 ) a11, + const void* ALIGNED( 64 ) a12, const void* ALIGNED( 64 ) a13, + const void* ALIGNED( 64 ) a14, const void* ALIGNED( 64 ) a15, + v16& b00, v16& b01, v16& b02, v16& b03, v16& b04, v16& b05, + v16& b06, v16& b07, v16& b08, v16& b09, v16& b10, v16& b11, + v16& b12, v16& b13, v16& b14, v16& b15 ) +{ + b00.i[0] = ( (const int* ALIGNED( 64 ))a00 )[0]; + b01.i[0] = ( (const int* ALIGNED( 64 ))a00 )[1]; + b02.i[0] = ( (const int* ALIGNED( 64 ))a00 )[2]; + b03.i[0] = ( (const int* ALIGNED( 64 ))a00 )[3]; + b04.i[0] = ( (const int* ALIGNED( 64 ))a00 )[4]; + b05.i[0] = ( (const int* ALIGNED( 64 ))a00 )[5]; + b06.i[0] = ( (const int* ALIGNED( 64 ))a00 )[6]; + b07.i[0] = ( (const int* ALIGNED( 64 ))a00 )[7]; + b00.i[1] = ( (const int* ALIGNED( 64 ))a00 )[8]; + b01.i[1] = ( (const int* ALIGNED( 64 ))a00 )[9]; + b02.i[1] = ( (const int* ALIGNED( 64 ))a00 )[10]; + b03.i[1] = ( (const int* ALIGNED( 64 ))a00 )[11]; + b04.i[1] = ( (const int* ALIGNED( 64 ))a00 )[12]; + b05.i[1] = ( (const int* ALIGNED( 64 ))a00 )[13]; + b06.i[1] = ( (const int* ALIGNED( 64 
))a00 )[14]; + b07.i[1] = ( (const int* ALIGNED( 64 ))a00 )[15]; + + b00.i[2] = ( (const int* ALIGNED( 64 ))a01 )[0]; + b01.i[2] = ( (const int* ALIGNED( 64 ))a01 )[1]; + b02.i[2] = ( (const int* ALIGNED( 64 ))a01 )[2]; + b03.i[2] = ( (const int* ALIGNED( 64 ))a01 )[3]; + b04.i[2] = ( (const int* ALIGNED( 64 ))a01 )[4]; + b05.i[2] = ( (const int* ALIGNED( 64 ))a01 )[5]; + b06.i[2] = ( (const int* ALIGNED( 64 ))a01 )[6]; + b07.i[2] = ( (const int* ALIGNED( 64 ))a01 )[7]; + b00.i[3] = ( (const int* ALIGNED( 64 ))a01 )[8]; + b01.i[3] = ( (const int* ALIGNED( 64 ))a01 )[9]; + b02.i[3] = ( (const int* ALIGNED( 64 ))a01 )[10]; + b03.i[3] = ( (const int* ALIGNED( 64 ))a01 )[11]; + b04.i[3] = ( (const int* ALIGNED( 64 ))a01 )[12]; + b05.i[3] = ( (const int* ALIGNED( 64 ))a01 )[13]; + b06.i[3] = ( (const int* ALIGNED( 64 ))a01 )[14]; + b07.i[3] = ( (const int* ALIGNED( 64 ))a01 )[15]; + + b00.i[4] = ( (const int* ALIGNED( 64 ))a02 )[0]; + b01.i[4] = ( (const int* ALIGNED( 64 ))a02 )[1]; + b02.i[4] = ( (const int* ALIGNED( 64 ))a02 )[2]; + b03.i[4] = ( (const int* ALIGNED( 64 ))a02 )[3]; + b04.i[4] = ( (const int* ALIGNED( 64 ))a02 )[4]; + b05.i[4] = ( (const int* ALIGNED( 64 ))a02 )[5]; + b06.i[4] = ( (const int* ALIGNED( 64 ))a02 )[6]; + b07.i[4] = ( (const int* ALIGNED( 64 ))a02 )[7]; + b00.i[5] = ( (const int* ALIGNED( 64 ))a02 )[8]; + b01.i[5] = ( (const int* ALIGNED( 64 ))a02 )[9]; + b02.i[5] = ( (const int* ALIGNED( 64 ))a02 )[10]; + b03.i[5] = ( (const int* ALIGNED( 64 ))a02 )[11]; + b04.i[5] = ( (const int* ALIGNED( 64 ))a02 )[12]; + b05.i[5] = ( (const int* ALIGNED( 64 ))a02 )[13]; + b06.i[5] = ( (const int* ALIGNED( 64 ))a02 )[14]; + b07.i[5] = ( (const int* ALIGNED( 64 ))a02 )[15]; + + b00.i[6] = ( (const int* ALIGNED( 64 ))a03 )[0]; + b01.i[6] = ( (const int* ALIGNED( 64 ))a03 )[1]; + b02.i[6] = ( (const int* ALIGNED( 64 ))a03 )[2]; + b03.i[6] = ( (const int* ALIGNED( 64 ))a03 )[3]; + b04.i[6] = ( (const int* ALIGNED( 64 ))a03 )[4]; + b05.i[6] = ( (const int* 
ALIGNED( 64 ))a03 )[5]; + b06.i[6] = ( (const int* ALIGNED( 64 ))a03 )[6]; + b07.i[6] = ( (const int* ALIGNED( 64 ))a03 )[7]; + b00.i[7] = ( (const int* ALIGNED( 64 ))a03 )[8]; + b01.i[7] = ( (const int* ALIGNED( 64 ))a03 )[9]; + b02.i[7] = ( (const int* ALIGNED( 64 ))a03 )[10]; + b03.i[7] = ( (const int* ALIGNED( 64 ))a03 )[11]; + b04.i[7] = ( (const int* ALIGNED( 64 ))a03 )[12]; + b05.i[7] = ( (const int* ALIGNED( 64 ))a03 )[13]; + b06.i[7] = ( (const int* ALIGNED( 64 ))a03 )[14]; + b07.i[7] = ( (const int* ALIGNED( 64 ))a03 )[15]; + + b00.i[8] = ( (const int* ALIGNED( 64 ))a04 )[0]; + b01.i[8] = ( (const int* ALIGNED( 64 ))a04 )[1]; + b02.i[8] = ( (const int* ALIGNED( 64 ))a04 )[2]; + b03.i[8] = ( (const int* ALIGNED( 64 ))a04 )[3]; + b04.i[8] = ( (const int* ALIGNED( 64 ))a04 )[4]; + b05.i[8] = ( (const int* ALIGNED( 64 ))a04 )[5]; + b06.i[8] = ( (const int* ALIGNED( 64 ))a04 )[6]; + b07.i[8] = ( (const int* ALIGNED( 64 ))a04 )[7]; + b00.i[9] = ( (const int* ALIGNED( 64 ))a04 )[8]; + b01.i[9] = ( (const int* ALIGNED( 64 ))a04 )[9]; + b02.i[9] = ( (const int* ALIGNED( 64 ))a04 )[10]; + b03.i[9] = ( (const int* ALIGNED( 64 ))a04 )[11]; + b04.i[9] = ( (const int* ALIGNED( 64 ))a04 )[12]; + b05.i[9] = ( (const int* ALIGNED( 64 ))a04 )[13]; + b06.i[9] = ( (const int* ALIGNED( 64 ))a04 )[14]; + b07.i[9] = ( (const int* ALIGNED( 64 ))a04 )[15]; + + b00.i[10] = ( (const int* ALIGNED( 64 ))a05 )[0]; + b01.i[10] = ( (const int* ALIGNED( 64 ))a05 )[1]; + b02.i[10] = ( (const int* ALIGNED( 64 ))a05 )[2]; + b03.i[10] = ( (const int* ALIGNED( 64 ))a05 )[3]; + b04.i[10] = ( (const int* ALIGNED( 64 ))a05 )[4]; + b05.i[10] = ( (const int* ALIGNED( 64 ))a05 )[5]; + b06.i[10] = ( (const int* ALIGNED( 64 ))a05 )[6]; + b07.i[10] = ( (const int* ALIGNED( 64 ))a05 )[7]; + b00.i[11] = ( (const int* ALIGNED( 64 ))a05 )[8]; + b01.i[11] = ( (const int* ALIGNED( 64 ))a05 )[9]; + b02.i[11] = ( (const int* ALIGNED( 64 ))a05 )[10]; + b03.i[11] = ( (const int* ALIGNED( 64 ))a05 )[11]; + 
b04.i[11] = ( (const int* ALIGNED( 64 ))a05 )[12]; + b05.i[11] = ( (const int* ALIGNED( 64 ))a05 )[13]; + b06.i[11] = ( (const int* ALIGNED( 64 ))a05 )[14]; + b07.i[11] = ( (const int* ALIGNED( 64 ))a05 )[15]; + + b00.i[12] = ( (const int* ALIGNED( 64 ))a06 )[0]; + b01.i[12] = ( (const int* ALIGNED( 64 ))a06 )[1]; + b02.i[12] = ( (const int* ALIGNED( 64 ))a06 )[2]; + b03.i[12] = ( (const int* ALIGNED( 64 ))a06 )[3]; + b04.i[12] = ( (const int* ALIGNED( 64 ))a06 )[4]; + b05.i[12] = ( (const int* ALIGNED( 64 ))a06 )[5]; + b06.i[12] = ( (const int* ALIGNED( 64 ))a06 )[6]; + b07.i[12] = ( (const int* ALIGNED( 64 ))a06 )[7]; + b00.i[13] = ( (const int* ALIGNED( 64 ))a06 )[8]; + b01.i[13] = ( (const int* ALIGNED( 64 ))a06 )[9]; + b02.i[13] = ( (const int* ALIGNED( 64 ))a06 )[10]; + b03.i[13] = ( (const int* ALIGNED( 64 ))a06 )[11]; + b04.i[13] = ( (const int* ALIGNED( 64 ))a06 )[12]; + b05.i[13] = ( (const int* ALIGNED( 64 ))a06 )[13]; + b06.i[13] = ( (const int* ALIGNED( 64 ))a06 )[14]; + b07.i[13] = ( (const int* ALIGNED( 64 ))a06 )[15]; + + b00.i[14] = ( (const int* ALIGNED( 64 ))a07 )[0]; + b01.i[14] = ( (const int* ALIGNED( 64 ))a07 )[1]; + b02.i[14] = ( (const int* ALIGNED( 64 ))a07 )[2]; + b03.i[14] = ( (const int* ALIGNED( 64 ))a07 )[3]; + b04.i[14] = ( (const int* ALIGNED( 64 ))a07 )[4]; + b05.i[14] = ( (const int* ALIGNED( 64 ))a07 )[5]; + b06.i[14] = ( (const int* ALIGNED( 64 ))a07 )[6]; + b07.i[14] = ( (const int* ALIGNED( 64 ))a07 )[7]; + b00.i[15] = ( (const int* ALIGNED( 64 ))a07 )[8]; + b01.i[15] = ( (const int* ALIGNED( 64 ))a07 )[9]; + b02.i[15] = ( (const int* ALIGNED( 64 ))a07 )[10]; + b03.i[15] = ( (const int* ALIGNED( 64 ))a07 )[11]; + b04.i[15] = ( (const int* ALIGNED( 64 ))a07 )[12]; + b05.i[15] = ( (const int* ALIGNED( 64 ))a07 )[13]; + b06.i[15] = ( (const int* ALIGNED( 64 ))a07 )[14]; + b07.i[15] = ( (const int* ALIGNED( 64 ))a07 )[15]; + + b08.i[0] = ( (const int* ALIGNED( 64 ))a08 )[0]; + b09.i[0] = ( (const int* ALIGNED( 64 ))a08 )[1]; + 
b10.i[0] = ( (const int* ALIGNED( 64 ))a08 )[2]; + b11.i[0] = ( (const int* ALIGNED( 64 ))a08 )[3]; + b12.i[0] = ( (const int* ALIGNED( 64 ))a08 )[4]; + b13.i[0] = ( (const int* ALIGNED( 64 ))a08 )[5]; + b14.i[0] = ( (const int* ALIGNED( 64 ))a08 )[6]; + b15.i[0] = ( (const int* ALIGNED( 64 ))a08 )[7]; + b08.i[1] = ( (const int* ALIGNED( 64 ))a08 )[8]; + b09.i[1] = ( (const int* ALIGNED( 64 ))a08 )[9]; + b10.i[1] = ( (const int* ALIGNED( 64 ))a08 )[10]; + b11.i[1] = ( (const int* ALIGNED( 64 ))a08 )[11]; + b12.i[1] = ( (const int* ALIGNED( 64 ))a08 )[12]; + b13.i[1] = ( (const int* ALIGNED( 64 ))a08 )[13]; + b14.i[1] = ( (const int* ALIGNED( 64 ))a08 )[14]; + b15.i[1] = ( (const int* ALIGNED( 64 ))a08 )[15]; + + b08.i[2] = ( (const int* ALIGNED( 64 ))a09 )[0]; + b09.i[2] = ( (const int* ALIGNED( 64 ))a09 )[1]; + b10.i[2] = ( (const int* ALIGNED( 64 ))a09 )[2]; + b11.i[2] = ( (const int* ALIGNED( 64 ))a09 )[3]; + b12.i[2] = ( (const int* ALIGNED( 64 ))a09 )[4]; + b13.i[2] = ( (const int* ALIGNED( 64 ))a09 )[5]; + b14.i[2] = ( (const int* ALIGNED( 64 ))a09 )[6]; + b15.i[2] = ( (const int* ALIGNED( 64 ))a09 )[7]; + b08.i[3] = ( (const int* ALIGNED( 64 ))a09 )[8]; + b09.i[3] = ( (const int* ALIGNED( 64 ))a09 )[9]; + b10.i[3] = ( (const int* ALIGNED( 64 ))a09 )[10]; + b11.i[3] = ( (const int* ALIGNED( 64 ))a09 )[11]; + b12.i[3] = ( (const int* ALIGNED( 64 ))a09 )[12]; + b13.i[3] = ( (const int* ALIGNED( 64 ))a09 )[13]; + b14.i[3] = ( (const int* ALIGNED( 64 ))a09 )[14]; + b15.i[3] = ( (const int* ALIGNED( 64 ))a09 )[15]; + + b08.i[4] = ( (const int* ALIGNED( 64 ))a10 )[0]; + b09.i[4] = ( (const int* ALIGNED( 64 ))a10 )[1]; + b10.i[4] = ( (const int* ALIGNED( 64 ))a10 )[2]; + b11.i[4] = ( (const int* ALIGNED( 64 ))a10 )[3]; + b12.i[4] = ( (const int* ALIGNED( 64 ))a10 )[4]; + b13.i[4] = ( (const int* ALIGNED( 64 ))a10 )[5]; + b14.i[4] = ( (const int* ALIGNED( 64 ))a10 )[6]; + b15.i[4] = ( (const int* ALIGNED( 64 ))a10 )[7]; + b08.i[5] = ( (const int* ALIGNED( 64 ))a10 
)[8]; + b09.i[5] = ( (const int* ALIGNED( 64 ))a10 )[9]; + b10.i[5] = ( (const int* ALIGNED( 64 ))a10 )[10]; + b11.i[5] = ( (const int* ALIGNED( 64 ))a10 )[11]; + b12.i[5] = ( (const int* ALIGNED( 64 ))a10 )[12]; + b13.i[5] = ( (const int* ALIGNED( 64 ))a10 )[13]; + b14.i[5] = ( (const int* ALIGNED( 64 ))a10 )[14]; + b15.i[5] = ( (const int* ALIGNED( 64 ))a10 )[15]; + + b08.i[6] = ( (const int* ALIGNED( 64 ))a11 )[0]; + b09.i[6] = ( (const int* ALIGNED( 64 ))a11 )[1]; + b10.i[6] = ( (const int* ALIGNED( 64 ))a11 )[2]; + b11.i[6] = ( (const int* ALIGNED( 64 ))a11 )[3]; + b12.i[6] = ( (const int* ALIGNED( 64 ))a11 )[4]; + b13.i[6] = ( (const int* ALIGNED( 64 ))a11 )[5]; + b14.i[6] = ( (const int* ALIGNED( 64 ))a11 )[6]; + b15.i[6] = ( (const int* ALIGNED( 64 ))a11 )[7]; + b08.i[7] = ( (const int* ALIGNED( 64 ))a11 )[8]; + b09.i[7] = ( (const int* ALIGNED( 64 ))a11 )[9]; + b10.i[7] = ( (const int* ALIGNED( 64 ))a11 )[10]; + b11.i[7] = ( (const int* ALIGNED( 64 ))a11 )[11]; + b12.i[7] = ( (const int* ALIGNED( 64 ))a11 )[12]; + b13.i[7] = ( (const int* ALIGNED( 64 ))a11 )[13]; + b14.i[7] = ( (const int* ALIGNED( 64 ))a11 )[14]; + b15.i[7] = ( (const int* ALIGNED( 64 ))a11 )[15]; + + b08.i[8] = ( (const int* ALIGNED( 64 ))a12 )[0]; + b09.i[8] = ( (const int* ALIGNED( 64 ))a12 )[1]; + b10.i[8] = ( (const int* ALIGNED( 64 ))a12 )[2]; + b11.i[8] = ( (const int* ALIGNED( 64 ))a12 )[3]; + b12.i[8] = ( (const int* ALIGNED( 64 ))a12 )[4]; + b13.i[8] = ( (const int* ALIGNED( 64 ))a12 )[5]; + b14.i[8] = ( (const int* ALIGNED( 64 ))a12 )[6]; + b15.i[8] = ( (const int* ALIGNED( 64 ))a12 )[7]; + b08.i[9] = ( (const int* ALIGNED( 64 ))a12 )[8]; + b09.i[9] = ( (const int* ALIGNED( 64 ))a12 )[9]; + b10.i[9] = ( (const int* ALIGNED( 64 ))a12 )[10]; + b11.i[9] = ( (const int* ALIGNED( 64 ))a12 )[11]; + b12.i[9] = ( (const int* ALIGNED( 64 ))a12 )[12]; + b13.i[9] = ( (const int* ALIGNED( 64 ))a12 )[13]; + b14.i[9] = ( (const int* ALIGNED( 64 ))a12 )[14]; + b15.i[9] = ( (const int* 
ALIGNED( 64 ))a12 )[15]; + + b08.i[10] = ( (const int* ALIGNED( 64 ))a13 )[0]; + b09.i[10] = ( (const int* ALIGNED( 64 ))a13 )[1]; + b10.i[10] = ( (const int* ALIGNED( 64 ))a13 )[2]; + b11.i[10] = ( (const int* ALIGNED( 64 ))a13 )[3]; + b12.i[10] = ( (const int* ALIGNED( 64 ))a13 )[4]; + b13.i[10] = ( (const int* ALIGNED( 64 ))a13 )[5]; + b14.i[10] = ( (const int* ALIGNED( 64 ))a13 )[6]; + b15.i[10] = ( (const int* ALIGNED( 64 ))a13 )[7]; + b08.i[11] = ( (const int* ALIGNED( 64 ))a13 )[8]; + b09.i[11] = ( (const int* ALIGNED( 64 ))a13 )[9]; + b10.i[11] = ( (const int* ALIGNED( 64 ))a13 )[10]; + b11.i[11] = ( (const int* ALIGNED( 64 ))a13 )[11]; + b12.i[11] = ( (const int* ALIGNED( 64 ))a13 )[12]; + b13.i[11] = ( (const int* ALIGNED( 64 ))a13 )[13]; + b14.i[11] = ( (const int* ALIGNED( 64 ))a13 )[14]; + b15.i[11] = ( (const int* ALIGNED( 64 ))a13 )[15]; + + b08.i[12] = ( (const int* ALIGNED( 64 ))a14 )[0]; + b09.i[12] = ( (const int* ALIGNED( 64 ))a14 )[1]; + b10.i[12] = ( (const int* ALIGNED( 64 ))a14 )[2]; + b11.i[12] = ( (const int* ALIGNED( 64 ))a14 )[3]; + b12.i[12] = ( (const int* ALIGNED( 64 ))a14 )[4]; + b13.i[12] = ( (const int* ALIGNED( 64 ))a14 )[5]; + b14.i[12] = ( (const int* ALIGNED( 64 ))a14 )[6]; + b15.i[12] = ( (const int* ALIGNED( 64 ))a14 )[7]; + b08.i[13] = ( (const int* ALIGNED( 64 ))a14 )[8]; + b09.i[13] = ( (const int* ALIGNED( 64 ))a14 )[9]; + b10.i[13] = ( (const int* ALIGNED( 64 ))a14 )[10]; + b11.i[13] = ( (const int* ALIGNED( 64 ))a14 )[11]; + b12.i[13] = ( (const int* ALIGNED( 64 ))a14 )[12]; + b13.i[13] = ( (const int* ALIGNED( 64 ))a14 )[13]; + b14.i[13] = ( (const int* ALIGNED( 64 ))a14 )[14]; + b15.i[13] = ( (const int* ALIGNED( 64 ))a14 )[15]; + + b08.i[14] = ( (const int* ALIGNED( 64 ))a15 )[0]; + b09.i[14] = ( (const int* ALIGNED( 64 ))a15 )[1]; + b10.i[14] = ( (const int* ALIGNED( 64 ))a15 )[2]; + b11.i[14] = ( (const int* ALIGNED( 64 ))a15 )[3]; + b12.i[14] = ( (const int* ALIGNED( 64 ))a15 )[4]; + b13.i[14] = ( (const int* 
ALIGNED( 64 ))a15 )[5]; + b14.i[14] = ( (const int* ALIGNED( 64 ))a15 )[6]; + b15.i[14] = ( (const int* ALIGNED( 64 ))a15 )[7]; + b08.i[15] = ( (const int* ALIGNED( 64 ))a15 )[8]; + b09.i[15] = ( (const int* ALIGNED( 64 ))a15 )[9]; + b10.i[15] = ( (const int* ALIGNED( 64 ))a15 )[10]; + b11.i[15] = ( (const int* ALIGNED( 64 ))a15 )[11]; + b12.i[15] = ( (const int* ALIGNED( 64 ))a15 )[12]; + b13.i[15] = ( (const int* ALIGNED( 64 ))a15 )[13]; + b14.i[15] = ( (const int* ALIGNED( 64 ))a15 )[14]; + b15.i[15] = ( (const int* ALIGNED( 64 ))a15 )[15]; +} + +inline void store_16x1_tr( const v16& a, void* a00, void* a01, void* a02, + void* a03, void* a04, void* a05, void* a06, + void* a07, void* a08, void* a09, void* a10, + void* a11, void* a12, void* a13, void* a14, + void* a15 ) +{ + ( (int*)a00 )[0] = a.i[0]; + ( (int*)a01 )[0] = a.i[1]; + ( (int*)a02 )[0] = a.i[2]; + ( (int*)a03 )[0] = a.i[3]; + ( (int*)a04 )[0] = a.i[4]; + ( (int*)a05 )[0] = a.i[5]; + ( (int*)a06 )[0] = a.i[6]; + ( (int*)a07 )[0] = a.i[7]; + ( (int*)a08 )[0] = a.i[8]; + ( (int*)a09 )[0] = a.i[9]; + ( (int*)a10 )[0] = a.i[10]; + ( (int*)a11 )[0] = a.i[11]; + ( (int*)a12 )[0] = a.i[12]; + ( (int*)a13 )[0] = a.i[13]; + ( (int*)a14 )[0] = a.i[14]; + ( (int*)a15 )[0] = a.i[15]; +} + +inline void store_16x2_tr( const v16& a, const v16& b, void* ALIGNED( 8 ) a00, + void* ALIGNED( 8 ) a01, void* ALIGNED( 8 ) a02, + void* ALIGNED( 8 ) a03, void* ALIGNED( 8 ) a04, + void* ALIGNED( 8 ) a05, void* ALIGNED( 8 ) a06, + void* ALIGNED( 8 ) a07, void* ALIGNED( 8 ) a08, + void* ALIGNED( 8 ) a09, void* ALIGNED( 8 ) a10, + void* ALIGNED( 8 ) a11, void* ALIGNED( 8 ) a12, + void* ALIGNED( 8 ) a13, void* ALIGNED( 8 ) a14, + void* ALIGNED( 8 ) a15 ) +{ + ( (int* ALIGNED( 8 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a00 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a01 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a02 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 
))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a03 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a04 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a05 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a06 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a07 )[1] = b.i[7]; + + ( (int* ALIGNED( 8 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 8 ))a08 )[1] = b.i[8]; + + ( (int* ALIGNED( 8 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 8 ))a09 )[1] = b.i[9]; + + ( (int* ALIGNED( 8 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 8 ))a10 )[1] = b.i[10]; + + ( (int* ALIGNED( 8 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 8 ))a11 )[1] = b.i[11]; + + ( (int* ALIGNED( 8 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 8 ))a12 )[1] = b.i[12]; + + ( (int* ALIGNED( 8 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 8 ))a13 )[1] = b.i[13]; + + ( (int* ALIGNED( 8 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 8 ))a14 )[1] = b.i[14]; + + ( (int* ALIGNED( 8 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 8 ))a15 )[1] = b.i[15]; +} + +inline void store_16x3_tr( const v16& a, const v16& b, const v16& c, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, + void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, + void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + + ( 
(int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; +} + +inline void store_16x4_tr( const v16& a, const v16& b, const v16& c, + const v16& d, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, + void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* 
ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, + void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + + ( (int* ALIGNED( 64 ))a10 
)[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; +} + +inline void store_16x8_tr( + const v16& a, const v16& b, const v16& c, const v16& d, const v16& e, + const v16& f, const v16& g, const v16& h, void* ALIGNED( 64 ) a00, + void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, void* ALIGNED( 64 ) a06, + void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, void* ALIGNED( 64 ) a09, + void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, void* ALIGNED( 64 ) a12, + void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = a.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = c.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = d.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = e.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = f.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = g.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = h.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = a.i[1]; + ( (int* 
ALIGNED( 64 ))a01 )[1] = b.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = c.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = d.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = e.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = f.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = g.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = h.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = a.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = c.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = d.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = e.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = f.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = g.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = h.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = a.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = c.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = d.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = e.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = f.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = g.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = h.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = a.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = c.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = d.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = e.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = f.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = g.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = h.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = a.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = c.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = d.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = e.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = f.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = g.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = h.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = a.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = c.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = d.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = e.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = f.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = g.i[6]; + ( (int* ALIGNED( 64 
))a06 )[7] = h.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = a.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = c.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = d.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = e.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = f.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = g.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = h.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = a.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = c.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = d.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = e.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = f.i[8]; + ( (int* ALIGNED( 64 ))a08 )[6] = g.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = h.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = a.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = c.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = d.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = e.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = f.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = g.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = h.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = a.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = c.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = d.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = e.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = f.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = g.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = h.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = a.i[11]; + ( (int* ALIGNED( 64 ))a11 )[1] = b.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = c.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = d.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = e.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = f.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = g.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = h.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = a.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = c.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = d.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = e.i[12]; + ( (int* 
ALIGNED( 64 ))a12 )[5] = f.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = g.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = h.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = a.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = c.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = d.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = e.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = f.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = g.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = h.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = a.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = c.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = d.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = e.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = f.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = g.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = h.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = a.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = c.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = d.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = e.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = f.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = g.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = h.i[15]; +} + +inline void store_16x16_tr( + const v16& b00, const v16& b01, const v16& b02, const v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* 
ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b08.i[0]; + ( (int* ALIGNED( 64 ))a00 )[9] = b09.i[0]; + ( (int* ALIGNED( 64 ))a00 )[10] = b10.i[0]; + ( (int* ALIGNED( 64 ))a00 )[11] = b11.i[0]; + ( (int* ALIGNED( 64 ))a00 )[12] = b12.i[0]; + ( (int* ALIGNED( 64 ))a00 )[13] = b13.i[0]; + ( (int* ALIGNED( 64 ))a00 )[14] = b14.i[0]; + ( (int* ALIGNED( 64 ))a00 )[15] = b15.i[0]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[1]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[1]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[1]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[1]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[1]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[1]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[1]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[1]; + ( (int* ALIGNED( 64 ))a01 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a01 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a01 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a01 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a01 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a01 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a01 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a01 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a02 )[8] = b08.i[2]; + ( (int* ALIGNED( 64 ))a02 )[9] = b09.i[2]; + ( (int* ALIGNED( 64 ))a02 )[10] = b10.i[2]; + ( (int* ALIGNED( 64 ))a02 )[11] = b11.i[2]; + ( (int* ALIGNED( 64 ))a02 )[12] = b12.i[2]; + ( (int* ALIGNED( 64 ))a02 )[13] = b13.i[2]; + ( (int* 
ALIGNED( 64 ))a02 )[14] = b14.i[2]; + ( (int* ALIGNED( 64 ))a02 )[15] = b15.i[2]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[3]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[3]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[3]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[3]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[3]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[3]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[3]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[3]; + ( (int* ALIGNED( 64 ))a03 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a03 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a03 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a03 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a03 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a03 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a03 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a03 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a04 )[8] = b08.i[4]; + ( (int* ALIGNED( 64 ))a04 )[9] = b09.i[4]; + ( (int* ALIGNED( 64 ))a04 )[10] = b10.i[4]; + ( (int* ALIGNED( 64 ))a04 )[11] = b11.i[4]; + ( (int* ALIGNED( 64 ))a04 )[12] = b12.i[4]; + ( (int* ALIGNED( 64 ))a04 )[13] = b13.i[4]; + ( (int* ALIGNED( 64 ))a04 )[14] = b14.i[4]; + ( (int* ALIGNED( 64 ))a04 )[15] = b15.i[4]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[5]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[5]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[5]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[5]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[5]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[5]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[5]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[5]; + ( (int* ALIGNED( 64 ))a05 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a05 )[9] = b09.i[5]; + ( (int* 
ALIGNED( 64 ))a05 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a05 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a05 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a05 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a05 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a05 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a06 )[8] = b08.i[6]; + ( (int* ALIGNED( 64 ))a06 )[9] = b09.i[6]; + ( (int* ALIGNED( 64 ))a06 )[10] = b10.i[6]; + ( (int* ALIGNED( 64 ))a06 )[11] = b11.i[6]; + ( (int* ALIGNED( 64 ))a06 )[12] = b12.i[6]; + ( (int* ALIGNED( 64 ))a06 )[13] = b13.i[6]; + ( (int* ALIGNED( 64 ))a06 )[14] = b14.i[6]; + ( (int* ALIGNED( 64 ))a06 )[15] = b15.i[6]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[7]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[7]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[7]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[7]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[7]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[7]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[7]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[7]; + ( (int* ALIGNED( 64 ))a07 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a07 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a07 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a07 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a07 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a07 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a07 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a07 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a08 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a08 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a08 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a08 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a08 )[5] = b05.i[8]; + ( 
(int* ALIGNED( 64 ))a08 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a08 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[8]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[8]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[8]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[8]; + ( (int* ALIGNED( 64 ))a08 )[12] = b12.i[8]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[8]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[8]; + ( (int* ALIGNED( 64 ))a08 )[15] = b15.i[8]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b00.i[9]; + ( (int* ALIGNED( 64 ))a09 )[1] = b01.i[9]; + ( (int* ALIGNED( 64 ))a09 )[2] = b02.i[9]; + ( (int* ALIGNED( 64 ))a09 )[3] = b03.i[9]; + ( (int* ALIGNED( 64 ))a09 )[4] = b04.i[9]; + ( (int* ALIGNED( 64 ))a09 )[5] = b05.i[9]; + ( (int* ALIGNED( 64 ))a09 )[6] = b06.i[9]; + ( (int* ALIGNED( 64 ))a09 )[7] = b07.i[9]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[9]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a10 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a10 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a10 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a10 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a10 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a10 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a10 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a10 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[10]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[10]; + ( (int* ALIGNED( 64 ))a10 )[10] = b10.i[10]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[10]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[10]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[10]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[10]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[10]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b00.i[11]; + ( (int* ALIGNED( 64 ))a11 
)[1] = b01.i[11]; + ( (int* ALIGNED( 64 ))a11 )[2] = b02.i[11]; + ( (int* ALIGNED( 64 ))a11 )[3] = b03.i[11]; + ( (int* ALIGNED( 64 ))a11 )[4] = b04.i[11]; + ( (int* ALIGNED( 64 ))a11 )[5] = b05.i[11]; + ( (int* ALIGNED( 64 ))a11 )[6] = b06.i[11]; + ( (int* ALIGNED( 64 ))a11 )[7] = b07.i[11]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 ))a11 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a12 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a12 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a12 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a12 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a12 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a12 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a12 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[12]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[12]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[12]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[12]; + ( (int* ALIGNED( 64 ))a12 )[12] = b12.i[12]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[12]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[12]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[12]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b00.i[13]; + ( (int* ALIGNED( 64 ))a13 )[1] = b01.i[13]; + ( (int* ALIGNED( 64 ))a13 )[2] = b02.i[13]; + ( (int* ALIGNED( 64 ))a13 )[3] = b03.i[13]; + ( (int* ALIGNED( 64 ))a13 )[4] = b04.i[13]; + ( (int* ALIGNED( 64 ))a13 )[5] = b05.i[13]; + ( (int* ALIGNED( 64 ))a13 )[6] = b06.i[13]; + ( (int* ALIGNED( 64 ))a13 )[7] = b07.i[13]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a13 
)[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a14 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a14 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a14 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a14 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a14 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a14 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a14 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[14]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[14]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[14]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[14]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[14]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[14]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[14]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[14]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b00.i[15]; + ( (int* ALIGNED( 64 ))a15 )[1] = b01.i[15]; + ( (int* ALIGNED( 64 ))a15 )[2] = b02.i[15]; + ( (int* ALIGNED( 64 ))a15 )[3] = b03.i[15]; + ( (int* ALIGNED( 64 ))a15 )[4] = b04.i[15]; + ( (int* ALIGNED( 64 ))a15 )[5] = b05.i[15]; + ( (int* ALIGNED( 64 ))a15 )[6] = b06.i[15]; + ( (int* ALIGNED( 64 ))a15 )[7] = b07.i[15]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +inline void store_16x8_tr_p( const v16& b00, const v16& b01, const v16& b02, + const v16& b03, const v16& b04, const v16& b05, + const v16& b06, const v16& b07, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, + void* ALIGNED( 64 ) a02, void* ALIGNED( 64 ) a03, + void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, 
void* ALIGNED( 64 ) a07 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 
64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( 
(int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( (int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; +} + +inline void store_16x16_tr_p( + const v16& b00, const v16& b01, const v16& b02, const 
v16& b03, + const v16& b04, const v16& b05, const v16& b06, const v16& b07, + const v16& b08, const v16& b09, const v16& b10, const v16& b11, + const v16& b12, const v16& b13, const v16& b14, const v16& b15, + void* ALIGNED( 64 ) a00, void* ALIGNED( 64 ) a01, void* ALIGNED( 64 ) a02, + void* ALIGNED( 64 ) a03, void* ALIGNED( 64 ) a04, void* ALIGNED( 64 ) a05, + void* ALIGNED( 64 ) a06, void* ALIGNED( 64 ) a07, void* ALIGNED( 64 ) a08, + void* ALIGNED( 64 ) a09, void* ALIGNED( 64 ) a10, void* ALIGNED( 64 ) a11, + void* ALIGNED( 64 ) a12, void* ALIGNED( 64 ) a13, void* ALIGNED( 64 ) a14, + void* ALIGNED( 64 ) a15 ) +{ + ( (int* ALIGNED( 64 ))a00 )[0] = b00.i[0]; + ( (int* ALIGNED( 64 ))a00 )[1] = b01.i[0]; + ( (int* ALIGNED( 64 ))a00 )[2] = b02.i[0]; + ( (int* ALIGNED( 64 ))a00 )[3] = b03.i[0]; + ( (int* ALIGNED( 64 ))a00 )[4] = b04.i[0]; + ( (int* ALIGNED( 64 ))a00 )[5] = b05.i[0]; + ( (int* ALIGNED( 64 ))a00 )[6] = b06.i[0]; + ( (int* ALIGNED( 64 ))a00 )[7] = b07.i[0]; + ( (int* ALIGNED( 64 ))a00 )[8] = b00.i[1]; + ( (int* ALIGNED( 64 ))a00 )[9] = b01.i[1]; + ( (int* ALIGNED( 64 ))a00 )[10] = b02.i[1]; + ( (int* ALIGNED( 64 ))a00 )[11] = b03.i[1]; + ( (int* ALIGNED( 64 ))a00 )[12] = b04.i[1]; + ( (int* ALIGNED( 64 ))a00 )[13] = b05.i[1]; + ( (int* ALIGNED( 64 ))a00 )[14] = b06.i[1]; + ( (int* ALIGNED( 64 ))a00 )[15] = b07.i[1]; + + ( (int* ALIGNED( 64 ))a01 )[0] = b00.i[2]; + ( (int* ALIGNED( 64 ))a01 )[1] = b01.i[2]; + ( (int* ALIGNED( 64 ))a01 )[2] = b02.i[2]; + ( (int* ALIGNED( 64 ))a01 )[3] = b03.i[2]; + ( (int* ALIGNED( 64 ))a01 )[4] = b04.i[2]; + ( (int* ALIGNED( 64 ))a01 )[5] = b05.i[2]; + ( (int* ALIGNED( 64 ))a01 )[6] = b06.i[2]; + ( (int* ALIGNED( 64 ))a01 )[7] = b07.i[2]; + ( (int* ALIGNED( 64 ))a01 )[8] = b00.i[3]; + ( (int* ALIGNED( 64 ))a01 )[9] = b01.i[3]; + ( (int* ALIGNED( 64 ))a01 )[10] = b02.i[3]; + ( (int* ALIGNED( 64 ))a01 )[11] = b03.i[3]; + ( (int* ALIGNED( 64 ))a01 )[12] = b04.i[3]; + ( (int* ALIGNED( 64 ))a01 )[13] = b05.i[3]; + ( (int* 
ALIGNED( 64 ))a01 )[14] = b06.i[3]; + ( (int* ALIGNED( 64 ))a01 )[15] = b07.i[3]; + + ( (int* ALIGNED( 64 ))a02 )[0] = b00.i[4]; + ( (int* ALIGNED( 64 ))a02 )[1] = b01.i[4]; + ( (int* ALIGNED( 64 ))a02 )[2] = b02.i[4]; + ( (int* ALIGNED( 64 ))a02 )[3] = b03.i[4]; + ( (int* ALIGNED( 64 ))a02 )[4] = b04.i[4]; + ( (int* ALIGNED( 64 ))a02 )[5] = b05.i[4]; + ( (int* ALIGNED( 64 ))a02 )[6] = b06.i[4]; + ( (int* ALIGNED( 64 ))a02 )[7] = b07.i[4]; + ( (int* ALIGNED( 64 ))a02 )[8] = b00.i[5]; + ( (int* ALIGNED( 64 ))a02 )[9] = b01.i[5]; + ( (int* ALIGNED( 64 ))a02 )[10] = b02.i[5]; + ( (int* ALIGNED( 64 ))a02 )[11] = b03.i[5]; + ( (int* ALIGNED( 64 ))a02 )[12] = b04.i[5]; + ( (int* ALIGNED( 64 ))a02 )[13] = b05.i[5]; + ( (int* ALIGNED( 64 ))a02 )[14] = b06.i[5]; + ( (int* ALIGNED( 64 ))a02 )[15] = b07.i[5]; + + ( (int* ALIGNED( 64 ))a03 )[0] = b00.i[6]; + ( (int* ALIGNED( 64 ))a03 )[1] = b01.i[6]; + ( (int* ALIGNED( 64 ))a03 )[2] = b02.i[6]; + ( (int* ALIGNED( 64 ))a03 )[3] = b03.i[6]; + ( (int* ALIGNED( 64 ))a03 )[4] = b04.i[6]; + ( (int* ALIGNED( 64 ))a03 )[5] = b05.i[6]; + ( (int* ALIGNED( 64 ))a03 )[6] = b06.i[6]; + ( (int* ALIGNED( 64 ))a03 )[7] = b07.i[6]; + ( (int* ALIGNED( 64 ))a03 )[8] = b00.i[7]; + ( (int* ALIGNED( 64 ))a03 )[9] = b01.i[7]; + ( (int* ALIGNED( 64 ))a03 )[10] = b02.i[7]; + ( (int* ALIGNED( 64 ))a03 )[11] = b03.i[7]; + ( (int* ALIGNED( 64 ))a03 )[12] = b04.i[7]; + ( (int* ALIGNED( 64 ))a03 )[13] = b05.i[7]; + ( (int* ALIGNED( 64 ))a03 )[14] = b06.i[7]; + ( (int* ALIGNED( 64 ))a03 )[15] = b07.i[7]; + + ( (int* ALIGNED( 64 ))a04 )[0] = b00.i[8]; + ( (int* ALIGNED( 64 ))a04 )[1] = b01.i[8]; + ( (int* ALIGNED( 64 ))a04 )[2] = b02.i[8]; + ( (int* ALIGNED( 64 ))a04 )[3] = b03.i[8]; + ( (int* ALIGNED( 64 ))a04 )[4] = b04.i[8]; + ( (int* ALIGNED( 64 ))a04 )[5] = b05.i[8]; + ( (int* ALIGNED( 64 ))a04 )[6] = b06.i[8]; + ( (int* ALIGNED( 64 ))a04 )[7] = b07.i[8]; + ( (int* ALIGNED( 64 ))a04 )[8] = b00.i[9]; + ( (int* ALIGNED( 64 ))a04 )[9] = b01.i[9]; + ( (int* 
ALIGNED( 64 ))a04 )[10] = b02.i[9]; + ( (int* ALIGNED( 64 ))a04 )[11] = b03.i[9]; + ( (int* ALIGNED( 64 ))a04 )[12] = b04.i[9]; + ( (int* ALIGNED( 64 ))a04 )[13] = b05.i[9]; + ( (int* ALIGNED( 64 ))a04 )[14] = b06.i[9]; + ( (int* ALIGNED( 64 ))a04 )[15] = b07.i[9]; + + ( (int* ALIGNED( 64 ))a05 )[0] = b00.i[10]; + ( (int* ALIGNED( 64 ))a05 )[1] = b01.i[10]; + ( (int* ALIGNED( 64 ))a05 )[2] = b02.i[10]; + ( (int* ALIGNED( 64 ))a05 )[3] = b03.i[10]; + ( (int* ALIGNED( 64 ))a05 )[4] = b04.i[10]; + ( (int* ALIGNED( 64 ))a05 )[5] = b05.i[10]; + ( (int* ALIGNED( 64 ))a05 )[6] = b06.i[10]; + ( (int* ALIGNED( 64 ))a05 )[7] = b07.i[10]; + ( (int* ALIGNED( 64 ))a05 )[8] = b00.i[11]; + ( (int* ALIGNED( 64 ))a05 )[9] = b01.i[11]; + ( (int* ALIGNED( 64 ))a05 )[10] = b02.i[11]; + ( (int* ALIGNED( 64 ))a05 )[11] = b03.i[11]; + ( (int* ALIGNED( 64 ))a05 )[12] = b04.i[11]; + ( (int* ALIGNED( 64 ))a05 )[13] = b05.i[11]; + ( (int* ALIGNED( 64 ))a05 )[14] = b06.i[11]; + ( (int* ALIGNED( 64 ))a05 )[15] = b07.i[11]; + + ( (int* ALIGNED( 64 ))a06 )[0] = b00.i[12]; + ( (int* ALIGNED( 64 ))a06 )[1] = b01.i[12]; + ( (int* ALIGNED( 64 ))a06 )[2] = b02.i[12]; + ( (int* ALIGNED( 64 ))a06 )[3] = b03.i[12]; + ( (int* ALIGNED( 64 ))a06 )[4] = b04.i[12]; + ( (int* ALIGNED( 64 ))a06 )[5] = b05.i[12]; + ( (int* ALIGNED( 64 ))a06 )[6] = b06.i[12]; + ( (int* ALIGNED( 64 ))a06 )[7] = b07.i[12]; + ( (int* ALIGNED( 64 ))a06 )[8] = b00.i[13]; + ( (int* ALIGNED( 64 ))a06 )[9] = b01.i[13]; + ( (int* ALIGNED( 64 ))a06 )[10] = b02.i[13]; + ( (int* ALIGNED( 64 ))a06 )[11] = b03.i[13]; + ( (int* ALIGNED( 64 ))a06 )[12] = b04.i[13]; + ( (int* ALIGNED( 64 ))a06 )[13] = b05.i[13]; + ( (int* ALIGNED( 64 ))a06 )[14] = b06.i[13]; + ( (int* ALIGNED( 64 ))a06 )[15] = b07.i[13]; + + ( (int* ALIGNED( 64 ))a07 )[0] = b00.i[14]; + ( (int* ALIGNED( 64 ))a07 )[1] = b01.i[14]; + ( (int* ALIGNED( 64 ))a07 )[2] = b02.i[14]; + ( (int* ALIGNED( 64 ))a07 )[3] = b03.i[14]; + ( (int* ALIGNED( 64 ))a07 )[4] = b04.i[14]; + ( (int* 
ALIGNED( 64 ))a07 )[5] = b05.i[14]; + ( (int* ALIGNED( 64 ))a07 )[6] = b06.i[14]; + ( (int* ALIGNED( 64 ))a07 )[7] = b07.i[14]; + ( (int* ALIGNED( 64 ))a07 )[8] = b00.i[15]; + ( (int* ALIGNED( 64 ))a07 )[9] = b01.i[15]; + ( (int* ALIGNED( 64 ))a07 )[10] = b02.i[15]; + ( (int* ALIGNED( 64 ))a07 )[11] = b03.i[15]; + ( (int* ALIGNED( 64 ))a07 )[12] = b04.i[15]; + ( (int* ALIGNED( 64 ))a07 )[13] = b05.i[15]; + ( (int* ALIGNED( 64 ))a07 )[14] = b06.i[15]; + ( (int* ALIGNED( 64 ))a07 )[15] = b07.i[15]; + + ( (int* ALIGNED( 64 ))a08 )[0] = b08.i[0]; + ( (int* ALIGNED( 64 ))a08 )[1] = b09.i[0]; + ( (int* ALIGNED( 64 ))a08 )[2] = b10.i[0]; + ( (int* ALIGNED( 64 ))a08 )[3] = b11.i[0]; + ( (int* ALIGNED( 64 ))a08 )[4] = b12.i[0]; + ( (int* ALIGNED( 64 ))a08 )[5] = b13.i[0]; + ( (int* ALIGNED( 64 ))a08 )[6] = b14.i[0]; + ( (int* ALIGNED( 64 ))a08 )[7] = b15.i[0]; + ( (int* ALIGNED( 64 ))a08 )[8] = b08.i[1]; + ( (int* ALIGNED( 64 ))a08 )[9] = b09.i[1]; + ( (int* ALIGNED( 64 ))a08 )[10] = b10.i[1]; + ( (int* ALIGNED( 64 ))a08 )[11] = b11.i[1]; + ( (int* ALIGNED( 64 ))a08 )[12] = b12.i[1]; + ( (int* ALIGNED( 64 ))a08 )[13] = b13.i[1]; + ( (int* ALIGNED( 64 ))a08 )[14] = b14.i[1]; + ( (int* ALIGNED( 64 ))a08 )[15] = b15.i[1]; + + ( (int* ALIGNED( 64 ))a09 )[0] = b08.i[2]; + ( (int* ALIGNED( 64 ))a09 )[1] = b09.i[2]; + ( (int* ALIGNED( 64 ))a09 )[2] = b10.i[2]; + ( (int* ALIGNED( 64 ))a09 )[3] = b11.i[2]; + ( (int* ALIGNED( 64 ))a09 )[4] = b12.i[2]; + ( (int* ALIGNED( 64 ))a09 )[5] = b13.i[2]; + ( (int* ALIGNED( 64 ))a09 )[6] = b14.i[2]; + ( (int* ALIGNED( 64 ))a09 )[7] = b15.i[2]; + ( (int* ALIGNED( 64 ))a09 )[8] = b08.i[3]; + ( (int* ALIGNED( 64 ))a09 )[9] = b09.i[3]; + ( (int* ALIGNED( 64 ))a09 )[10] = b10.i[3]; + ( (int* ALIGNED( 64 ))a09 )[11] = b11.i[3]; + ( (int* ALIGNED( 64 ))a09 )[12] = b12.i[3]; + ( (int* ALIGNED( 64 ))a09 )[13] = b13.i[3]; + ( (int* ALIGNED( 64 ))a09 )[14] = b14.i[3]; + ( (int* ALIGNED( 64 ))a09 )[15] = b15.i[3]; + + ( (int* ALIGNED( 64 ))a10 )[0] = 
b08.i[4]; + ( (int* ALIGNED( 64 ))a10 )[1] = b09.i[4]; + ( (int* ALIGNED( 64 ))a10 )[2] = b10.i[4]; + ( (int* ALIGNED( 64 ))a10 )[3] = b11.i[4]; + ( (int* ALIGNED( 64 ))a10 )[4] = b12.i[4]; + ( (int* ALIGNED( 64 ))a10 )[5] = b13.i[4]; + ( (int* ALIGNED( 64 ))a10 )[6] = b14.i[4]; + ( (int* ALIGNED( 64 ))a10 )[7] = b15.i[4]; + ( (int* ALIGNED( 64 ))a10 )[8] = b08.i[5]; + ( (int* ALIGNED( 64 ))a10 )[9] = b09.i[5]; + ( (int* ALIGNED( 64 ))a10 )[10] = b10.i[5]; + ( (int* ALIGNED( 64 ))a10 )[11] = b11.i[5]; + ( (int* ALIGNED( 64 ))a10 )[12] = b12.i[5]; + ( (int* ALIGNED( 64 ))a10 )[13] = b13.i[5]; + ( (int* ALIGNED( 64 ))a10 )[14] = b14.i[5]; + ( (int* ALIGNED( 64 ))a10 )[15] = b15.i[5]; + + ( (int* ALIGNED( 64 ))a11 )[0] = b08.i[6]; + ( (int* ALIGNED( 64 ))a11 )[1] = b09.i[6]; + ( (int* ALIGNED( 64 ))a11 )[2] = b10.i[6]; + ( (int* ALIGNED( 64 ))a11 )[3] = b11.i[6]; + ( (int* ALIGNED( 64 ))a11 )[4] = b12.i[6]; + ( (int* ALIGNED( 64 ))a11 )[5] = b13.i[6]; + ( (int* ALIGNED( 64 ))a11 )[6] = b14.i[6]; + ( (int* ALIGNED( 64 ))a11 )[7] = b15.i[6]; + ( (int* ALIGNED( 64 ))a11 )[8] = b08.i[7]; + ( (int* ALIGNED( 64 ))a11 )[9] = b09.i[7]; + ( (int* ALIGNED( 64 ))a11 )[10] = b10.i[7]; + ( (int* ALIGNED( 64 ))a11 )[11] = b11.i[7]; + ( (int* ALIGNED( 64 ))a11 )[12] = b12.i[7]; + ( (int* ALIGNED( 64 ))a11 )[13] = b13.i[7]; + ( (int* ALIGNED( 64 ))a11 )[14] = b14.i[7]; + ( (int* ALIGNED( 64 ))a11 )[15] = b15.i[7]; + + ( (int* ALIGNED( 64 ))a12 )[0] = b08.i[8]; + ( (int* ALIGNED( 64 ))a12 )[1] = b09.i[8]; + ( (int* ALIGNED( 64 ))a12 )[2] = b10.i[8]; + ( (int* ALIGNED( 64 ))a12 )[3] = b11.i[8]; + ( (int* ALIGNED( 64 ))a12 )[4] = b12.i[8]; + ( (int* ALIGNED( 64 ))a12 )[5] = b13.i[8]; + ( (int* ALIGNED( 64 ))a12 )[6] = b14.i[8]; + ( (int* ALIGNED( 64 ))a12 )[7] = b15.i[8]; + ( (int* ALIGNED( 64 ))a12 )[8] = b08.i[9]; + ( (int* ALIGNED( 64 ))a12 )[9] = b09.i[9]; + ( (int* ALIGNED( 64 ))a12 )[10] = b10.i[9]; + ( (int* ALIGNED( 64 ))a12 )[11] = b11.i[9]; + ( (int* ALIGNED( 64 ))a12 )[12] = 
b12.i[9]; + ( (int* ALIGNED( 64 ))a12 )[13] = b13.i[9]; + ( (int* ALIGNED( 64 ))a12 )[14] = b14.i[9]; + ( (int* ALIGNED( 64 ))a12 )[15] = b15.i[9]; + + ( (int* ALIGNED( 64 ))a13 )[0] = b08.i[10]; + ( (int* ALIGNED( 64 ))a13 )[1] = b09.i[10]; + ( (int* ALIGNED( 64 ))a13 )[2] = b10.i[10]; + ( (int* ALIGNED( 64 ))a13 )[3] = b11.i[10]; + ( (int* ALIGNED( 64 ))a13 )[4] = b12.i[10]; + ( (int* ALIGNED( 64 ))a13 )[5] = b13.i[10]; + ( (int* ALIGNED( 64 ))a13 )[6] = b14.i[10]; + ( (int* ALIGNED( 64 ))a13 )[7] = b15.i[10]; + ( (int* ALIGNED( 64 ))a13 )[8] = b08.i[11]; + ( (int* ALIGNED( 64 ))a13 )[9] = b09.i[11]; + ( (int* ALIGNED( 64 ))a13 )[10] = b10.i[11]; + ( (int* ALIGNED( 64 ))a13 )[11] = b11.i[11]; + ( (int* ALIGNED( 64 ))a13 )[12] = b12.i[11]; + ( (int* ALIGNED( 64 ))a13 )[13] = b13.i[11]; + ( (int* ALIGNED( 64 ))a13 )[14] = b14.i[11]; + ( (int* ALIGNED( 64 ))a13 )[15] = b15.i[11]; + + ( (int* ALIGNED( 64 ))a14 )[0] = b08.i[12]; + ( (int* ALIGNED( 64 ))a14 )[1] = b09.i[12]; + ( (int* ALIGNED( 64 ))a14 )[2] = b10.i[12]; + ( (int* ALIGNED( 64 ))a14 )[3] = b11.i[12]; + ( (int* ALIGNED( 64 ))a14 )[4] = b12.i[12]; + ( (int* ALIGNED( 64 ))a14 )[5] = b13.i[12]; + ( (int* ALIGNED( 64 ))a14 )[6] = b14.i[12]; + ( (int* ALIGNED( 64 ))a14 )[7] = b15.i[12]; + ( (int* ALIGNED( 64 ))a14 )[8] = b08.i[13]; + ( (int* ALIGNED( 64 ))a14 )[9] = b09.i[13]; + ( (int* ALIGNED( 64 ))a14 )[10] = b10.i[13]; + ( (int* ALIGNED( 64 ))a14 )[11] = b11.i[13]; + ( (int* ALIGNED( 64 ))a14 )[12] = b12.i[13]; + ( (int* ALIGNED( 64 ))a14 )[13] = b13.i[13]; + ( (int* ALIGNED( 64 ))a14 )[14] = b14.i[13]; + ( (int* ALIGNED( 64 ))a14 )[15] = b15.i[13]; + + ( (int* ALIGNED( 64 ))a15 )[0] = b08.i[14]; + ( (int* ALIGNED( 64 ))a15 )[1] = b09.i[14]; + ( (int* ALIGNED( 64 ))a15 )[2] = b10.i[14]; + ( (int* ALIGNED( 64 ))a15 )[3] = b11.i[14]; + ( (int* ALIGNED( 64 ))a15 )[4] = b12.i[14]; + ( (int* ALIGNED( 64 ))a15 )[5] = b13.i[14]; + ( (int* ALIGNED( 64 ))a15 )[6] = b14.i[14]; + ( (int* ALIGNED( 64 ))a15 )[7] = 
b15.i[14]; + ( (int* ALIGNED( 64 ))a15 )[8] = b08.i[15]; + ( (int* ALIGNED( 64 ))a15 )[9] = b09.i[15]; + ( (int* ALIGNED( 64 ))a15 )[10] = b10.i[15]; + ( (int* ALIGNED( 64 ))a15 )[11] = b11.i[15]; + ( (int* ALIGNED( 64 ))a15 )[12] = b12.i[15]; + ( (int* ALIGNED( 64 ))a15 )[13] = b13.i[15]; + ( (int* ALIGNED( 64 ))a15 )[14] = b14.i[15]; + ( (int* ALIGNED( 64 ))a15 )[15] = b15.i[15]; +} + +////////////// +// v16int class + +class v16int : public v16 +{ // v16int prefix unary operator friends - friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator~( const v16int& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16int prefix increment / decrement operator friends - friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a ) ALWAYS_INLINE; // v16int postfix increment / decrement operator friends - friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; - friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator++( v16int& a, int ) ALWAYS_INLINE; + friend inline v16int operator--( v16int& a, int ) ALWAYS_INLINE; // v16int binary operator friends - friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator *( const 
v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator+( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator-( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator*(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator/( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator%( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator^( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&(const v16int& a, + const v16int& b)ALWAYS_INLINE; + friend inline v16int operator|( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int logical operator friends - friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator >=( 
const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16int& a, + const v16int& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16int& a, + const v16int& b ) ALWAYS_INLINE; // v16int miscellaneous friends - friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; - friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; - friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16int abs( const v16int& a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int& c, const v16& a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int& c, const v16& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + friend inline v16 merge( const v16int& c, const v16& t, + const v16& f ) ALWAYS_INLINE; // v16float unary operator friends - friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float miscellaneous friends - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const 
v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; public: - // v16int constructors / destructors - v16int() {} // Default constructor + v16int() {} // Default constructor - v16int( const v16int &a ) // Copy constructor + v16int( const v16int& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; } - v16int( const v16 &a ) // Init from mixed + v16int( const v16& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; } - v16int( int a ) // Init from scalar + v16int( int a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - i[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + i[j] = a; } - v16int( int i00, int i01, int i02, int i03, - int i04, int i05, int i06, int i07, - int i08, int i09, int i10, int i11, - int i12, int i13, int i14, int i15 ) // Init from scalars + v16int( int i00, int i01, int i02, int i03, int i04, int i05, int i06, + int i07, int i08, int i09, int i10, int i11, int i12, int i13, + int i14, int i15 ) // Init from scalars { - i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; - i[ 4] = i04; i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; - i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; - i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + i[0] = i00; + i[1] = i01; + i[2] = i02; + i[3] = i03; + i[4] = i04; + i[5] = i05; + i[6] = i06; + i[7] = i07; + i[8] = i08; + i[9] = i09; + i[10] = i10; + i[11] = i11; + i[12] = i12; + i[13] = i13; + i[14] = i14; + 
i[15] = i15; } - ~v16int() {} // Destructor + ~v16int() {} // Destructor // v16int assignment operators -# define ASSIGN(op) \ - inline v16int &operator op( const v16int &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - i[j] op b.i[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16int& operator op( const v16int& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v16int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; - } - }; + inline int operator()( int n ) { return i[n]; } +}; - // v16int prefix unary operators +// v16int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v16int operator op( const v16int & a ) \ - { \ - v16int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_UNARY( op ) \ + inline v16int operator op( const v16int& a ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - PREFIX_UNARY(+) - PREFIX_UNARY(-) +PREFIX_UNARY( +) +PREFIX_UNARY( -) - inline v16int operator !( const v16int & a ) - { +inline v16int operator!( const v16int& a ) +{ v16int b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = - ( !a.i[j] ); + for ( int j = 0; j < 16; j++ ) + b.i[j] = -( !a.i[j] ); return b; - } - - PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - // v16int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v16int operator op( 
v16int & a ) \ - { \ - v16int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v16int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v16int operator op( v16int & a, int ) \ - { \ - v16int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - b.i[j] = ( a.i[j] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v16int binary operators - -# define BINARY(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = a.i[j] op b.i[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v16int logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16int &a, const v16int &b ) \ - { \ - v16int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = - ( a.i[j] op b.i[j] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16int miscellaneous functions - - inline v16int abs( const v16int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v16int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v16int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v16int operator op( v16int& a, int ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) 
+POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v16int binary operators + +#define BINARY( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v16int logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16int& a, const v16int& b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.i[j] op b.i[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16int miscellaneous functions + +inline v16int abs( const v16int& a ) +{ v16int b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; return b; - } +} - inline v16 czero( const v16int &c, const v16 &a ) - { +inline v16 czero( const v16int& c, const v16& a ) +{ v16 b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; - } +} - inline v16 notczero( const v16int &c, const v16 &a ) - { +inline v16 notczero( const v16int& c, const v16& a ) +{ v16 b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[j] & c.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & c.i[j]; return b; - } +} - inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) - { +inline v16 merge( const v16int& c, const v16& t, const v16& f ) +{ v16 m; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + for ( int j = 0; j < 16; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return m; - } +} - //////////////// - // v16float class +//////////////// +// v16float class - class v16float : public v16 - { +class v16float : public v16 +{ // v16float prefix unary operator friends - friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; - friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; - friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a ) ALWAYS_INLINE; + friend inline v16float operator~( const v16float& a ) ALWAYS_INLINE; + friend inline v16int operator!( const v16float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v16float prefix increment / decrement operator friends - friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator++( 
v16float& a ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a ) ALWAYS_INLINE; // v16float postfix increment / decrement operator friends - friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; - friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator++( v16float& a, int ) ALWAYS_INLINE; + friend inline v16float operator--( v16float& a, int ) ALWAYS_INLINE; // v16float binary operator friends - friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator+( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator-( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16float operator*(const v16float& a, + const v16float& b)ALWAYS_INLINE; + friend inline v16float operator/( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float logical operator friends - friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; - friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator<( const 
v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator==( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator!=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator<=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator>=( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator&&( const v16float& a, + const v16float& b ) ALWAYS_INLINE; + friend inline v16int operator||( const v16float& a, + const v16float& b ) ALWAYS_INLINE; // v16float math library friends -# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ - const v16float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v16float fn( const v16float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v16float fn( const v16float& a, const v16float& b ) \ + ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v16float 
miscellaneous friends - friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; - friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; - friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; - friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; - friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; - friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rsqrt( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float& a ) ALWAYS_INLINE; + friend inline v16float rcp( const v16float& a ) ALWAYS_INLINE; + friend inline v16float fma( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float& a, const v16float& b, + const v16float& c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int& m, + const v16float& a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int& m, + const 
v16float& a ) ALWAYS_INLINE; + friend inline void increment_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; + friend inline void scale_16x1( float* ALIGNED( 64 ) p, + const v16float& a ) ALWAYS_INLINE; public: - // v16float constructors / destructors - v16float() {} // Default constructor + v16float() {} // Default constructor - v16float( const v16float &a ) // Copy constructor + v16float( const v16float& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + f[j] = a.f[j]; } - v16float( const v16 &a ) // Init from mixed + v16float( const v16& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + f[j] = a.f[j]; } - v16float( float a ) // Init from scalar + v16float( float a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - f[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 16; j++ ) + f[j] = a; } - v16float( float f00, float f01, float f02, float f03, - float f04, float f05, float f06, float f07, - float f08, float f09, float f10, float f11, - float f12, float f13, float f14, float f15 ) // Init from scalars + v16float( float f00, float f01, float f02, float f03, float f04, float f05, + float f06, float f07, float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars { - f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; - f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; - f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; - f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + f[0] = f00; + f[1] = f01; + f[2] = f02; + f[3] = f03; + f[4] = f04; + f[5] = f05; + f[6] = f06; + f[7] = f07; + f[8] = f08; + f[9] = f09; + f[10] = f10; + f[11] = f11; + f[12] = f12; + f[13] = f13; + f[14] = 
f14; + f[15] = f15; } - ~v16float() {} // Destructor + ~v16float() {} // Destructor // v16float assignment operators -# define ASSIGN(op) \ - inline v16float &operator op( const v16float &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - f[j] op b.f[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v16float& operator op( const v16float& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v16float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v16float prefix unary operators +// v16float prefix unary operators - inline v16float operator +( const v16float &a ) - { +inline v16float operator+( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = +a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = +a.f[j]; return b; - } +} - inline v16float operator -( const v16float &a ) - { +inline v16float operator-( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = -a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = -a.f[j]; return b; - } +} - inline v16int operator !( const v16float &a ) - { +inline v16int operator!( const v16float& a ) +{ v16int b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = a.i[j] ? 0 : -1; + for ( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; return b; - } +} - // v16float prefix increment / decrement operators +// v16float prefix increment / decrement operators - inline v16float operator ++( v16float &a ) - { +inline v16float operator++( v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = ++a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = ++a.f[j]; return b; - } +} - inline v16float operator --( v16float &a ) - { +inline v16float operator--( v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = --a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = --a.f[j]; return b; - } +} - // v16float postfix increment / decrement operators +// v16float postfix increment / decrement operators - inline v16float operator ++( v16float &a, int ) - { +inline v16float operator++( v16float& a, int ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = a.f[j]++; + for ( int j = 0; j < 16; j++ ) + b.f[j] = a.f[j]++; return b; - } +} - inline v16float operator --( v16float &a, int ) - { +inline v16float operator--( v16float& a, int ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = a.f[j]--; + for ( int j = 0; j < 16; j++ ) + b.f[j] = a.f[j]--; return b; - } - - // v16float binary operators - -# define BINARY(op) \ - inline v16float operator op( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - c.f[j] = a.f[j] op b.f[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v16float logical operators - -# define LOGICAL(op) \ - inline v16int operator op( const v16float &a, const v16float &b ) \ - { \ - v16int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - c.i[j] = -( a.f[j] op b.f[j] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v16float math library 
functions - -# define CMATH_FR1(fn) \ - inline v16float fn( const v16float &a ) \ - { \ - v16float b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - b.f[j] = ::fn( a.f[j] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v16float fn( const v16float &a, const v16float &b ) \ - { \ - v16float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 16; j++ ) \ - c.f[j] = ::fn( a.f[j], b.f[j] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v16float copysign( const v16float &a, const v16float &b ) - { +} + +// v16float binary operators + +#define BINARY( op ) \ + inline v16float operator op( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v16float logical operators + +#define LOGICAL( op ) \ + inline v16int operator op( const v16float& a, const v16float& b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v16float math library functions + +#define CMATH_FR1( fn ) \ + inline v16float fn( const v16float& a ) \ + { \ + v16float b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v16float fn( const v16float& a, const v16float& b ) \ + { \ + v16float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 16; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + 
} + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + + inline v16float + copysign( const v16float& a, const v16float& b ) +{ v16float c; float t; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) + for ( int j = 0; j < 16; j++ ) { - t = ::fabs( a.f[j] ); - if( b.f[j] < 0 ) t = -t; - c.f[j] = t; + t = ::fabs( a.f[j] ); + if ( b.f[j] < 0 ) + t = -t; + c.f[j] = t; } return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v16float miscellaneous functions +// v16float miscellaneous functions - inline v16float rsqrt_approx( const v16float &a ) - { +inline v16float rsqrt_approx( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = ::sqrt( 1.0f/a.f[j] ); + for ( int j = 0; j < 16; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; - } +} - inline v16float rsqrt( const v16float &a ) - { +inline v16float rsqrt( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = ::sqrt( 1.0f/a.f[j] ); + for ( int j = 0; j < 16; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; - } +} - inline v16float rcp_approx( const v16float &a ) - { +inline v16float rcp_approx( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = 1.0f/a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v16float rcp( const v16float &a ) - { +inline v16float rcp( const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.f[j] = 1.0f/a.f[j]; + for ( int j = 0; j < 16; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v16float fma( const v16float &a, const v16float &b, const v16float &c 
) - { +inline v16float fma( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - d.f[j] = a.f[j] * b.f[j] + c.f[j]; + for ( int j = 0; j < 16; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; return d; - } +} - inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - d.f[j] = a.f[j] * b.f[j] - c.f[j]; + for ( int j = 0; j < 16; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; return d; - } +} - inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) - { +inline v16float fnms( const v16float& a, const v16float& b, const v16float& c ) +{ v16float d; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - d.f[j] = c.f[j] - a.f[j] * b.f[j]; + for ( int j = 0; j < 16; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; return d; - } +} - inline v16float clear_bits( const v16int &m, const v16float &a ) - { +inline v16float clear_bits( const v16int& m, const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = ( ~m.i[j] ) & a.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; - } +} - inline v16float set_bits( const v16int &m, const v16float &a ) - { +inline v16float set_bits( const v16int& m, const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = m.i[j] | a.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = m.i[j] | a.i[j]; return b; - } +} - inline v16float toggle_bits( const v16int &m, const v16float &a ) - { +inline v16float toggle_bits( const v16int& m, const v16float& a ) +{ v16float b; ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - b.i[j] = m.i[j] ^ a.i[j]; + for ( int j = 0; j < 16; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; return b; - } +} - inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline 
void increment_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - p[j] += a.f[j]; - } + for ( int j = 0; j < 16; j++ ) + p[j] += a.f[j]; +} - inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline void decrement_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - p[j] -= a.f[j]; - } + for ( int j = 0; j < 16; j++ ) + p[j] -= a.f[j]; +} - inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) - { +inline void scale_16x1( float* ALIGNED( 64 ) p, const v16float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 16; j++ ) - p[j] *= a.f[j]; - } + for ( int j = 0; j < 16; j++ ) + p[j] *= a.f[j]; +} } // namespace v16 diff --git a/src/util/v4/v4.h b/src/util/v4/v4.h index 0b8cc4c1..3e9ba978 100644 --- a/src/util/v4/v4.h +++ b/src/util/v4/v4.h @@ -8,25 +8,25 @@ #ifdef __cplusplus -# if defined USE_V4_ALTIVEC -# include "v4_altivec.h" +#if defined USE_V4_ALTIVEC +#include "v4_altivec.h" -# elif defined USE_V4_PORTABLE -# include "v4_portable.h" +#elif defined USE_V4_PORTABLE +#include "v4_portable.h" -# elif defined USE_V4_SSE -# include "v4_sse.h" +#elif defined USE_V4_SSE +#include "v4_sse.h" -# elif defined USE_V4_AVX -# include "v4_avx.h" +#elif defined USE_V4_AVX +#include "v4_avx.h" -# elif defined USE_V4_AVX2 -# include "v4_avx2.h" +#elif defined USE_V4_AVX2 +#include "v4_avx2.h" -# elif defined USE_V4_NEON -# include "v4_neon.h" +#elif defined USE_V4_NEON +#include "v4_neon.h" -# endif +#endif #endif diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index d9438fc1..fbbf437f 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -1,4 +1,4 @@ - #ifndef _v4_altivec_h_ +#ifndef _v4_altivec_h_ #define _v4_altivec_h_ #ifndef IN_v4_h @@ -12,7 +12,7 @@ #define V4_ALTIVEC_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif // See if this fixes a problem when compiling with GNU 
compilers. @@ -21,288 +21,270 @@ #undef vector #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; - - #define _v4_int __vector int - #define _v4_uint __vector unsigned int - #define _v4_float __vector float - #define _v16_uchar __vector unsigned char - - #define _PERM(i0,i1,i2,i3) \ - ( (_v16_uchar) { 4*(i0), 4*(i0)+1, 4*(i0)+2, 4*(i0)+3, \ - 4*(i1), 4*(i1)+1, 4*(i1)+2, 4*(i1)+3, \ - 4*(i2), 4*(i2)+1, 4*(i2)+2, 4*(i2)+3, \ - 4*(i3), 4*(i3)+1, 4*(i3)+2, 4*(i3)+3 } ) - - // FIXME: IS IT FASTER TO SPLAT THESE ON THE FLY - - const _v4_int _false = { 0, 0, 0, 0 }; - const _v4_int _true = { -1, -1, -1, -1 }; - const _v4_int _ione = { 1, 1, 1, 1 }; - - const _v4_float _zero = { 0.0f, 0.0f, 0.0f, 0.0f }; - const _v4_float _half = { 0.5f, 0.5f, 0.5f, 0.5f }; - const _v4_float _one = { 1.0f, 1.0f, 1.0f, 1.0f }; - const _v4_float _sign = { -0.0f, -0.0f, -0.0f, -0.0f }; - const _v4_float _n02 = { -0.0f, +0.0f, -0.0f, +0.0f }; - - //////////////// - // v4 base class - - class v4 - { +class v4; +class v4int; +class v4float; + +#define _v4_int __vector int +#define _v4_uint __vector unsigned int +#define _v4_float __vector float +#define _v16_uchar __vector unsigned char + +#define _PERM( i0, i1, i2, i3 ) \ + ( ( _v16_uchar ){ \ + 4 * ( i0 ), 4 * ( i0 ) + 1, 4 * ( i0 ) + 2, 4 * ( i0 ) + 3, \ + 4 * ( i1 ), 4 * ( i1 ) + 1, 4 * ( i1 ) + 2, 4 * ( i1 ) + 3, \ + 4 * ( i2 ), 4 * ( i2 ) + 1, 4 * ( i2 ) + 2, 4 * ( i2 ) + 3, \ + 4 * ( i3 ), 4 * ( i3 ) + 1, 4 * ( i3 ) + 2, 4 * ( i3 ) + 3} ) + +// FIXME: IS IT FASTER TO SPLAT THESE ON THE FLY + +const _v4_int _false = {0, 0, 0, 0}; +const _v4_int _true = {-1, -1, -1, -1}; +const _v4_int _ione = {1, 1, 1, 1}; + +const _v4_float _zero = {0.0f, 0.0f, 0.0f, 0.0f}; +const _v4_float _half = {0.5f, 0.5f, 0.5f, 0.5f}; +const _v4_float _one = {1.0f, 1.0f, 1.0f, 1.0f}; +const _v4_float _sign = {-0.0f, -0.0f, -0.0f, -0.0f}; 
+const _v4_float _n02 = {-0.0f, +0.0f, -0.0f, +0.0f}; + +//////////////// +// v4 base class + +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void 
clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const 
void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: public: // wdn - _v4_float v; - - public: - v4() {} // Default constructor + public: + v4() {} // Default constructor - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return vec_any_ne( (_v4_int) a.v, _false ); - } +inline int any( const v4& a ) { return vec_any_ne( (_v4_int)a.v, _false ); } - inline int all( const v4 &a ) - { - return vec_all_ne( (_v4_int) a.v, _false ); - } 
+inline int all( const v4& a ) { return vec_all_ne( (_v4_int)a.v, _false ); } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; b.v = vec_splat( a.v, n ); return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; b.v = vec_perm( a.v, a.v, _PERM( i0, i1, i2, i3 ) ); return b; - } +} - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ _v4_float t = a.v; a.v = b.v; b.v = t; - } +} - inline void transpose( v4 &a, v4 &b, v4 &c, v4 &d ) - { - _v4_float a0 = a.v; // a0 = 0 1 2 3 - _v4_float b0 = b.v; // b0 = 4 5 6 7 - _v4_float c1 = c.v; // c1 = 8 9 10 11 - _v4_float d1 = d.v; // d1 = 12 13 14 15 +inline void transpose( v4& a, v4& b, v4& c, v4& d ) +{ + _v4_float a0 = a.v; // a0 = 0 1 2 3 + _v4_float b0 = b.v; // b0 = 4 5 6 7 + _v4_float c1 = c.v; // c1 = 8 9 10 11 + _v4_float d1 = d.v; // d1 = 12 13 14 15 // Step 1: Interleave top and bottom half _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 + c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 + d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 - c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 - d.v = vec_mergel( c1, d1 ); // d = 3 7 11 15 - } - - // v4 memory manipulation functions + a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 + b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 + c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 + d.v = vec_mergel( c1, d1 ); // d = 3 7 11 15 +} - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.v = vec_ld( 0, ( const float * ) p ); - } +// v4 memory manipulation functions - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - vec_st( a.v, 0, ( float * ) p ); - } 
+inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.v = vec_ld( 0, (const float*)p ); +} - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - vec_stl( a.v, 0, ( float * ) p ); - } +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + vec_st( a.v, 0, (float*)p ); +} - inline void clear_4x1( void * ALIGNED(16) p ) - { - vec_st( _zero, 0, ( float * ) p ); - } +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + vec_stl( a.v, 0, (float*)p ); +} - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - vec_st( vec_ld( 0, ( const float * ) src ), 0, ( float * ) dst ); - } +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + vec_st( _zero, 0, (float*)p ); +} - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { - _v4_float va = vec_ld( 0, ( float * ) a ); - _v4_float vb = vec_ld( 0, ( float * ) b ); +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + vec_st( vec_ld( 0, (const float*)src ), 0, (float*)dst ); +} - vec_st( vb, 0, ( float * ) a ); - vec_st( va, 0, ( float * ) b ); - } +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ + _v4_float va = vec_ld( 0, (float*)a ); + _v4_float vb = vec_ld( 0, (float*)b ); - // v4 transposed memory manipulation functions + vec_st( vb, 0, (float*)a ); + vec_st( va, 0, (float*)b ); +} - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.v = (_v4_float){ ( (const float *) a0 )[0], - ( (const float *) a1 )[0], - ( (const float *) a2 )[0], - ( (const float *) a3 )[0] }; - } +// v4 transposed memory manipulation functions - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.v = ( _v4_float ){( (const 
float*)a0 )[0], ( (const float*)a1 )[0], + ( (const float*)a2 )[0], ( (const float*)a3 )[0]}; +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ _v4_float r, s, t, u; - a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 - b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 - t = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 - u = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 + a.v = vec_ld( 0, (const float*)a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float*)a1 ); // b = 4 5 6 7 + t = vec_ld( 0, (const float*)a2 ); // c = 8 9 10 11 + u = vec_ld( 0, (const float*)a3 ); // d = 12 13 14 15 // Step 1: Interleave top and bottom half - r = vec_mergeh( a.v, t ); // r = 0 8 1 9 - s = vec_mergeh( b.v, u ); // s = 4 12 5 13 + r = vec_mergeh( a.v, t ); // r = 0 8 1 9 + s = vec_mergeh( b.v, u ); // s = 4 12 5 13 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( r, s ); // a = 0 4 8 12 - b.v = vec_mergel( r, s ); // b = 1 5 9 13 + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 // _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 // _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 @@ -318,123 +300,106 @@ namespace v4 // a.v = vec_mergeh( w, x ); // a = 0 4 8 12 // b.v = vec_mergel( w, x ); // b = 1 5 9 13 - } +} - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ _v4_float r, s, t, u, d_v; - a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 x - b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 x - c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 x - d_v = vec_ld( 
0, (const float *) a3 ); // d = 12 13 14 x + a.v = vec_ld( 0, (const float*)a0 ); // a = 0 1 2 x + b.v = vec_ld( 0, (const float*)a1 ); // b = 4 5 6 x + c.v = vec_ld( 0, (const float*)a2 ); // c = 8 9 10 x + d_v = vec_ld( 0, (const float*)a3 ); // d = 12 13 14 x // Step 1: Interleave top and bottom half - r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 - s = vec_mergeh( b.v, d_v ); // s = 4 12 5 13 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d_v ); // s = 4 12 5 13 - t = vec_mergel( a.v, c.v ); // t = 2 10 x x - u = vec_mergel( b.v, d_v ); // u = 6 14 x x + t = vec_mergel( a.v, c.v ); // t = 2 10 x x + u = vec_mergel( b.v, d_v ); // u = 6 14 x x // Step 2: Interleave even and odd rows - a.v = vec_mergeh( r, s ); // a = 0 4 8 12 - b.v = vec_mergel( r, s ); // b = 1 5 9 13 - c.v = vec_mergeh( t, u ); // c = 2 6 10 14 - } + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 +} - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ _v4_float r, s, t, u; - a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 - b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 - c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 - d.v = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 + a.v = vec_ld( 0, (const float*)a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float*)a1 ); // b = 4 5 6 7 + c.v = vec_ld( 0, (const float*)a2 ); // c = 8 9 10 11 + d.v = vec_ld( 0, (const float*)a3 ); // d = 12 13 14 15 // Step 1: Interleave top and bottom half - r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 - s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + r = vec_mergeh( a.v, c.v ); 
// r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 - t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 - u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( r, s ); // a = 0 4 8 12 - b.v = vec_mergel( r, s ); // b = 1 5 9 13 - c.v = vec_mergeh( t, u ); // c = 2 6 10 14 - d.v = vec_mergel( t, u ); // d = 3 7 11 15 - } - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - vec_ste( vec_splat( a.v, 0 ), 0, (float *) a0 ); - vec_ste( vec_splat( a.v, 1 ), 0, (float *) a1 ); - vec_ste( vec_splat( a.v, 2 ), 0, (float *) a2 ); - vec_ste( vec_splat( a.v, 3 ), 0, (float *) a3 ); - } + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 + d.v = vec_mergel( t, u ); // d = 3 7 11 15 +} - inline void store_4x2_tr( const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + vec_ste( vec_splat( a.v, 0 ), 0, (float*)a0 ); + vec_ste( vec_splat( a.v, 1 ), 0, (float*)a1 ); + vec_ste( vec_splat( a.v, 2 ), 0, (float*)a2 ); + vec_ste( vec_splat( a.v, 3 ), 0, (float*)a3 ); +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ _v4_float t; - t = vec_perm( a.v, b.v, _PERM(0,4,0,4) ); + t = vec_perm( a.v, b.v, _PERM( 0, 4, 0, 4 ) ); - vec_ste( t, 0, (float *) a0 ); - vec_ste( t, 4, (float *) a0 ); + vec_ste( t, 0, (float*)a0 ); + vec_ste( t, 4, (float*)a0 ); - t = vec_perm( a.v, b.v, _PERM(1,5,1,5) ); + t = vec_perm( a.v, b.v, _PERM( 1, 5, 1, 5 ) ); - vec_ste( t, 0, (float *) a1 ); - vec_ste( t, 4, (float *) a1 ); + vec_ste( t, 0, (float*)a1 ); + vec_ste( t, 4, 
(float*)a1 ); - t = vec_perm( a.v, b.v, _PERM(2,6,2,6) ); + t = vec_perm( a.v, b.v, _PERM( 2, 6, 2, 6 ) ); - vec_ste( t, 0, (float *) a2 ); - vec_ste( t, 4, (float *) a2 ); + vec_ste( t, 0, (float*)a2 ); + vec_ste( t, 4, (float*)a2 ); - t = vec_perm( a.v, b.v, _PERM(3,7,3,7) ); + t = vec_perm( a.v, b.v, _PERM( 3, 7, 3, 7 ) ); - vec_ste( t, 0, (float *) a3 ); - vec_ste( t, 4, (float *) a3 ); - } + vec_ste( t, 0, (float*)a3 ); + vec_ste( t, 4, (float*)a3 ); +} - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - _v4_float a_v = a.v; // a = 0 1 2 3 - _v4_float b_v = b.v; // b = 4 5 6 7 - _v4_float c_v = c.v; // c = 8 9 10 11 +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + _v4_float a_v = a.v; // a = 0 1 2 3 + _v4_float b_v = b.v; // b = 4 5 6 7 + _v4_float c_v = c.v; // c = 8 9 10 11 _v4_float t, u, v; @@ -443,251 +408,290 @@ namespace v4 v = vec_mergeh( t, u ); - vec_ste( v, 0, (float *) a0 ); - vec_ste( v, 4, (float *) a0 ); - vec_ste( v, 8, (float *) a0 ); + vec_ste( v, 0, (float*)a0 ); + vec_ste( v, 4, (float*)a0 ); + vec_ste( v, 8, (float*)a0 ); v = vec_mergel( t, u ); - vec_ste( v, 0, (float *) a1 ); - vec_ste( v, 4, (float *) a1 ); - vec_ste( v, 8, (float *) a1 ); + vec_ste( v, 0, (float*)a1 ); + vec_ste( v, 4, (float*)a1 ); + vec_ste( v, 8, (float*)a1 ); t = vec_mergel( a_v, c_v ); // t = 2 10 3 11 u = vec_mergel( b_v, b_v ); // u = 6 x 7 x v = vec_mergeh( t, u ); - vec_ste( v, 0, (float *) a2 ); - vec_ste( v, 4, (float *) a2 ); - vec_ste( v, 8, (float *) a2 ); + vec_ste( v, 0, (float*)a2 ); + vec_ste( v, 4, (float*)a2 ); + vec_ste( v, 8, (float*)a2 ); v = vec_mergel( t, u ); - vec_ste( v, 0, (float *) a3 ); - vec_ste( v, 4, (float *) a3 ); - vec_ste( v, 8, (float *) a3 ); - } + vec_ste( v, 0, (float*)a3 ); + 
vec_ste( v, 4, (float*)a3 ); + vec_ste( v, 8, (float*)a3 ); +} - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ _v4_float r, s, t, u; - // a = 0 1 2 3 - // b = 4 5 6 7 - // c = 8 9 10 11 - // d = 12 13 14 15 + // a = 0 1 2 3 + // b = 4 5 6 7 + // c = 8 9 10 11 + // d = 12 13 14 15 // Step 1: Interleave top and bottom half - r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 - s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 - t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 - u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - vec_st( vec_mergeh( r, s ), 0, (float *) a0 ); // a0 = 0 4 8 12 - vec_st( vec_mergel( r, s ), 0, (float *) a1 ); // a1 = 1 5 9 13 - vec_st( vec_mergeh( t, u ), 0, (float *) a2 ); // a2 = 2 6 10 14 - vec_st( vec_mergel( t, u ), 0, (float *) a3 ); // a3 = 3 7 11 15 - } + vec_st( vec_mergeh( r, s ), 0, (float*)a0 ); // a0 = 0 4 8 12 + vec_st( vec_mergel( r, s ), 0, (float*)a1 ); // a1 = 1 5 9 13 + vec_st( vec_mergeh( t, u ), 0, (float*)a2 ); // a2 = 2 6 10 14 + vec_st( vec_mergel( t, u ), 0, (float*)a3 ); // a3 = 3 7 11 15 +} - ////////////// - // v4int class +////////////// +// v4int class - class v4int : public v4 - { +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const 
v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; 
+ friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const 
v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; 
+ friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - v = (_v4_float) ( (_v4_int) { a, a, a, a } ); + v = ( _v4_float )( ( _v4_int ){a, a, a, a} ); } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - v = (_v4_float) ( (_v4_int) { i0, i1, i2, i3 } ); + v = ( _v4_float )( ( _v4_int ){i0, i1, i2, i3} ); } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op,instr) \ - inline v4int &operator op( const v4int &b ) \ - { \ - instr; \ - return *this; \ +#define ASSIGN( op, instr ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + instr; \ + return *this; \ } - ASSIGN(=, v = b.v ) - ASSIGN(+=, v = (_v4_float)vec_add( (_v4_int)v, (_v4_int)b.v ) ) - ASSIGN(-=, v = 
(_v4_float)vec_sub( (_v4_int)v, (_v4_int)b.v ) ) - ASSIGN(*=, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = v; u.v = b.v; - v = (_v4_float)((_v4_int){ t.i[0]*u.i[0], - t.i[1]*u.i[1], - t.i[2]*u.i[2], - t.i[3]*u.i[3] }) ) // FIXME: Sigh ... - ASSIGN(/=, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = v; u.v = b.v; - v = (_v4_float)((_v4_int){ t.i[0]/u.i[0], - t.i[1]/u.i[1], - t.i[2]/u.i[2], - t.i[3]/u.i[3] }) ) // FIXME: Sigh ... - ASSIGN(%=, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = v; u.v = b.v; - v = (_v4_float)((_v4_int){ t.i[0]%u.i[0], - t.i[1]%u.i[1], - t.i[2]%u.i[2], - t.i[3]%u.i[3] }) ) // FIXME: Sigh ... - ASSIGN(^=, v = (_v4_float)vec_xor( (_v4_int)v, (_v4_int) b.v ) ) - ASSIGN(&=, v = (_v4_float)vec_and( (_v4_int)v, (_v4_int) b.v ) ) - ASSIGN(|=, v = (_v4_float)vec_or( (_v4_int)v, (_v4_int) b.v ) ) - ASSIGN(<<=, v = (_v4_float)vec_sl( (_v4_int)v, (_v4_uint)b.v ) ) - ASSIGN(>>=, v = (_v4_float)vec_sr( (_v4_int)v, (_v4_uint)b.v ) ) - - #undef ASSIGN + ASSIGN( =, v = b.v ) + ASSIGN( +=, v = (_v4_float)vec_add( (_v4_int)v, (_v4_int)b.v ) ) + ASSIGN( -=, v = (_v4_float)vec_sub( (_v4_int)v, (_v4_int)b.v ) ) + ASSIGN( + *=, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = v; u.v = b.v; + v = ( _v4_float )( ( _v4_int ){t.i[0] * u.i[0], t.i[1] * u.i[1], + t.i[2] * u.i[2], + t.i[3] * u.i[3]} ) ) // FIXME: Sigh ... + ASSIGN( + /=, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = v; u.v = b.v; + v = ( _v4_float )( ( _v4_int ){t.i[0] / u.i[0], t.i[1] / u.i[1], + t.i[2] / u.i[2], + t.i[3] / u.i[3]} ) ) // FIXME: Sigh ... 
+ ASSIGN( + %=, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = v; u.v = b.v; + v = ( _v4_float )( ( _v4_int ){t.i[0] % u.i[0], t.i[1] % u.i[1], + t.i[2] % u.i[2], + t.i[3] % u.i[3]} ) ) // FIXME: Sigh ... + ASSIGN( ^=, v = (_v4_float)vec_xor( (_v4_int)v, (_v4_int)b.v ) ) + ASSIGN( &=, v = (_v4_float)vec_and( (_v4_int)v, (_v4_int)b.v ) ) + ASSIGN( |=, v = (_v4_float)vec_or( (_v4_int)v, (_v4_int)b.v ) ) + ASSIGN( <<=, v = (_v4_float)vec_sl( (_v4_int)v, (_v4_uint)b.v ) ) + ASSIGN( >>=, v = (_v4_float)vec_sr( (_v4_int)v, (_v4_uint)b.v ) ) + +#undef ASSIGN // v4int member access operator - inline int &operator []( int n ) - { - return ( (int *) &v )[n]; - } + inline int& operator[]( int n ) { return ( (int*)&v )[n]; } - inline int operator ()( int n ) + inline int operator()( int n ) { - union - { - int i[4]; - _v4_float v; - } t; + union { + int i[4]; + _v4_float v; + } t; - t.v = v; + t.v = v; - return t.i[n]; + return t.i[n]; } - }; +}; - // v4int prefix unary operators +// v4int prefix unary operators - #define PREFIX_UNARY(op,instr) \ - inline v4int operator op( const v4int &a ) \ - { \ - v4int b; \ - instr; \ - return b; \ - } +#define PREFIX_UNARY( op, instr ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + instr; \ + return b; \ + } - PREFIX_UNARY( +, b.v = a.v ) - PREFIX_UNARY( -, b.v = (_v4_float) vec_sub( _false, (_v4_int) a.v ) ) - PREFIX_UNARY( !, b.v = (_v4_float) vec_cmpeq( _false, (_v4_int) a.v ) ) - PREFIX_UNARY( ~, b.v = (_v4_float) vec_xor( _true, (_v4_int) a.v ) ) +PREFIX_UNARY( +, b.v = a.v ) +PREFIX_UNARY( -, b.v = (_v4_float)vec_sub( _false, (_v4_int)a.v ) ) +PREFIX_UNARY( !, b.v = (_v4_float)vec_cmpeq( _false, (_v4_int)a.v ) ) +PREFIX_UNARY( ~, b.v = (_v4_float)vec_xor( _true, (_v4_int)a.v ) ) - #undef PREFIX_UNARY +#undef PREFIX_UNARY - // v4int prefix increment / decrement operators +// v4int prefix increment / decrement operators - inline v4int operator ++( v4int &a ) - { - 
_v4_float a_v = (_v4_float) vec_add( (_v4_int) a.v, _ione ); +inline v4int operator++( v4int& a ) +{ + _v4_float a_v = (_v4_float)vec_add( (_v4_int)a.v, _ione ); v4int b; @@ -695,11 +699,11 @@ namespace v4 b.v = a_v; return b; - } +} - inline v4int operator --( v4int &a ) - { - _v4_float a_v = (_v4_float) vec_sub( (_v4_int) a.v, _ione ); +inline v4int operator--( v4int& a ) +{ + _v4_float a_v = (_v4_float)vec_sub( (_v4_int)a.v, _ione ); v4int b; @@ -707,291 +711,341 @@ namespace v4 b.v = a_v; return b; - } +} - // v4int postfix increment / decrement operators +// v4int postfix increment / decrement operators - inline v4int operator ++( v4int &a, int ) - { +inline v4int operator++( v4int& a, int ) +{ _v4_float a_v = a.v; v4int b; - a.v = (_v4_float) vec_add( (_v4_int) a_v, _ione ); + a.v = (_v4_float)vec_add( (_v4_int)a_v, _ione ); b.v = a_v; return b; - } +} - inline v4int operator --( v4int &a, int ) - { +inline v4int operator--( v4int& a, int ) +{ _v4_float a_v = a.v; v4int b; - a.v = (_v4_float) vec_sub( (_v4_int) a_v, _ione ); + a.v = (_v4_float)vec_sub( (_v4_int)a_v, _ione ); b.v = a_v; return b; - } +} - // v4int binary operators +// v4int binary operators - #define BINARY(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - instr; \ - return c; \ - } +#define BINARY( op, instr ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + instr; \ + return c; \ + } - BINARY(+, c.v = (_v4_float)vec_add( (_v4_int)a.v, (_v4_int) b.v ) ) - BINARY(-, c.v = (_v4_float)vec_sub( (_v4_int)a.v, (_v4_int) b.v ) ) - BINARY(*, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = a.v; u.v = b.v; - c.v = (_v4_float)((_v4_int){ t.i[0]*u.i[0], - t.i[1]*u.i[1], - t.i[2]*u.i[2], - t.i[3]*u.i[3] }) ) // FIXME: Sigh ... 
- BINARY(/, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = a.v; u.v = b.v; - c.v = (_v4_float)((_v4_int){ t.i[0]/u.i[0], - t.i[1]/u.i[1], - t.i[2]/u.i[2], - t.i[3]/u.i[3] }) ) // FIXME: Sigh ... - BINARY(%, union { int i[4]; _v4_float v; } t; - union { int i[4]; _v4_float v; } u; - t.v = a.v; u.v = b.v; - c.v = (_v4_float)((_v4_int){ t.i[0]%u.i[0], - t.i[1]%u.i[1], - t.i[2]%u.i[2], - t.i[3]%u.i[3] }) ) // FIXME: Sigh ... - BINARY(^, c.v = (_v4_float)vec_xor( (_v4_int)a.v, (_v4_int) b.v ) ) - BINARY(&, c.v = (_v4_float)vec_and( (_v4_int)a.v, (_v4_int) b.v ) ) - BINARY(|, c.v = (_v4_float)vec_or( (_v4_int)a.v, (_v4_int) b.v ) ) - BINARY(<<, c.v = (_v4_float)vec_sl( (_v4_int)a.v, (_v4_uint)b.v ) ) - BINARY(>>, c.v = (_v4_float)vec_sr( (_v4_int)a.v, (_v4_uint)b.v ) ) - - #undef BINARY - - // v4int logical operators - - #define LOGICAL(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - instr; \ - return c; \ - } +BINARY( +, c.v = (_v4_float)vec_add( (_v4_int)a.v, (_v4_int)b.v ) ) +BINARY( -, c.v = (_v4_float)vec_sub( (_v4_int)a.v, (_v4_int)b.v ) ) +BINARY( + *, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = a.v; u.v = b.v; + c.v = ( _v4_float )( ( _v4_int ){t.i[0] * u.i[0], t.i[1] * u.i[1], + t.i[2] * u.i[2], + t.i[3] * u.i[3]} ) ) // FIXME: Sigh ... +BINARY( + /, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = a.v; u.v = b.v; c.v = ( _v4_float )( ( _v4_int ){ + t.i[0] / u.i[0], t.i[1] / u.i[1], t.i[2] / u.i[2], + t.i[3] / u.i[3]} ) ) // FIXME: Sigh ... +BINARY( + %, + union { + int i[4]; + _v4_float v; + } t; + union { + int i[4]; + _v4_float v; + } u; + t.v = a.v; u.v = b.v; c.v = ( _v4_float )( ( _v4_int ){ + t.i[0] % u.i[0], t.i[1] % u.i[1], t.i[2] % u.i[2], + t.i[3] % u.i[3]} ) ) // FIXME: Sigh ... 
+BINARY( ^, c.v = (_v4_float)vec_xor( (_v4_int)a.v, (_v4_int)b.v ) ) +BINARY( &, c.v = (_v4_float)vec_and( (_v4_int)a.v, (_v4_int)b.v ) ) +BINARY( |, c.v = (_v4_float)vec_or( (_v4_int)a.v, (_v4_int)b.v ) ) +BINARY( <<, c.v = (_v4_float)vec_sl( (_v4_int)a.v, (_v4_uint)b.v ) ) +BINARY( >>, c.v = (_v4_float)vec_sr( (_v4_int)a.v, (_v4_uint)b.v ) ) + +#undef BINARY + +// v4int logical operators + +#define LOGICAL( op, instr ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + instr; \ + return c; \ + } - LOGICAL(<, c.v = (_v4_float)vec_cmplt( (_v4_int)a.v, (_v4_int)b.v ) ) - LOGICAL(>, c.v = (_v4_float)vec_cmpgt( (_v4_int)a.v, (_v4_int)b.v ) ) - LOGICAL(==, c.v = (_v4_float)vec_cmpeq( (_v4_int)a.v, (_v4_int)b.v ) ) - LOGICAL(!=, c.v = (_v4_float)vec_xor( _true, vec_cmpeq( (_v4_int)a.v, - (_v4_int)b.v ) ) ) - LOGICAL(<=, c.v = (_v4_float)vec_xor( _true, vec_cmpgt( (_v4_int)a.v, - (_v4_int)b.v ) ) ) - LOGICAL(>=, c.v = (_v4_float)vec_xor( _true, vec_cmplt( (_v4_int)a.v, - (_v4_int)b.v ) ) ) - LOGICAL(&&, c.v = (_v4_float)vec_xor( _true, - vec_or( vec_cmpeq( (_v4_int)a.v, - _false ), - vec_cmpeq( (_v4_int)b.v, - _false ) ) ) ) - LOGICAL(||, c.v = (_v4_float)vec_xor( _true, - vec_and( vec_cmpeq( (_v4_int)a.v, - _false ), - vec_cmpeq( (_v4_int)b.v, - _false ) ) ) ) - - #undef LOGICAL - - // v4int miscellaneous functions - - inline v4int abs( const v4int &a ) - { +LOGICAL( <, c.v = (_v4_float)vec_cmplt( (_v4_int)a.v, (_v4_int)b.v ) ) +LOGICAL( >, c.v = (_v4_float)vec_cmpgt( (_v4_int)a.v, (_v4_int)b.v ) ) +LOGICAL( ==, c.v = (_v4_float)vec_cmpeq( (_v4_int)a.v, (_v4_int)b.v ) ) +LOGICAL( !=, c.v = (_v4_float)vec_xor( _true, vec_cmpeq( (_v4_int)a.v, + (_v4_int)b.v ) ) ) +LOGICAL( <=, c.v = (_v4_float)vec_xor( _true, vec_cmpgt( (_v4_int)a.v, + (_v4_int)b.v ) ) ) +LOGICAL( >=, c.v = (_v4_float)vec_xor( _true, vec_cmplt( (_v4_int)a.v, + (_v4_int)b.v ) ) ) +LOGICAL( &&, c.v = (_v4_float) + vec_xor( _true, vec_or( vec_cmpeq( (_v4_int)a.v, _false ), 
+ vec_cmpeq( (_v4_int)b.v, _false ) ) ) ) +LOGICAL( ||, c.v = (_v4_float)vec_xor( + _true, vec_and( vec_cmpeq( (_v4_int)a.v, _false ), + vec_cmpeq( (_v4_int)b.v, _false ) ) ) ) + +#undef LOGICAL + +// v4int miscellaneous functions + +inline v4int abs( const v4int& a ) +{ v4int b; - b.v = (_v4_float) vec_abs( (_v4_int) a.v ); + b.v = (_v4_float)vec_abs( (_v4_int)a.v ); return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.v = vec_andc( a.v, c.v ); return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.v = vec_and( a.v, c.v ); return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; - tf.v = vec_sel( f.v, t.v, (_v4_uint) c.v ); + tf.v = vec_sel( f.v, t.v, (_v4_uint)c.v ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) 
ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int 
operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float 
rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) 
ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - v = (_v4_float) { a, a, a, a }; + v = ( _v4_float ){a, a, a, a}; } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - v = (_v4_float) { f0, f1, f2, f3 }; + v = ( _v4_float ){f0, f1, f2, f3}; } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op,instr) \ - inline v4float &operator op( const v4float &b ) \ - { \ - instr; \ - return *this; \ +#define ASSIGN( op, instr ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + instr; \ + return *this; \ } - ASSIGN( =, v = b.v ); + ASSIGN( =, v = b.v ); ASSIGN( +=, v = vec_add( v, b.v ) ); ASSIGN( -=, v = vec_sub( v, b.v ) ); ASSIGN( *=, v = vec_madd( v, b.v, _zero ) ); - #undef ASSIGN +#undef ASSIGN // This does one NR iteration and is supposed to be accurate enough. - inline v4float &operator /=( const v4float &a ) + inline v4float& operator/=( const v4float& a ) { - _v4_float a_v = a.v, b_v; + _v4_float a_v = a.v, b_v; - // Compute an estimate of the reciprocal of a (??-bit accurate) + // Compute an estimate of the reciprocal of a (??-bit accurate) - b_v = vec_re( a_v ); + b_v = vec_re( a_v ); - // FIXME: CHECK NUMERICS ... 
HOW MANY N-R STEPS TO USE? APPLE'S - // ALTIVEC WEB PAGE SUGGESTS TWO STEPS AND GIVES THE BELOW - // IMPLEMENTATION FOR THE REFINEMENT. + // FIXME: CHECK NUMERICS ... HOW MANY N-R STEPS TO USE? APPLE'S + // ALTIVEC WEB PAGE SUGGESTS TWO STEPS AND GIVES THE BELOW + // IMPLEMENTATION FOR THE REFINEMENT. - // FIXME: IS THIS THE MOST ACCURATE FORM FOR THE REFINEMENT? - // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE - // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). + // FIXME: IS THIS THE MOST ACCURATE FORM FOR THE REFINEMENT? + // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE + // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). - b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); - // Compute n * refined( (1/a)_estimate ) to get result n/a + // Compute n * refined( (1/a)_estimate ) to get result n/a - v = vec_madd( v, b_v, _zero ); + v = vec_madd( v, b_v, _zero ); - return *this; + return *this; } - #if 0 +#if 0 // This is a more accurate version that does two NR iterations. 
inline v4float &operator /=( const v4float &a ) { @@ -1018,62 +1072,58 @@ namespace v4 return *this; } - #endif +#endif // v4float member access operator - inline float &operator []( int n ) - { - return ( (float *) &v )[n]; - } + inline float& operator[]( int n ) { return ( (float*)&v )[n]; } - inline float operator ()( int n ) + inline float operator()( int n ) { - union - { - float f[4]; - _v4_float v; - } t; + union { + float f[4]; + _v4_float v; + } t; - t.v = v; + t.v = v; - return t.f[n]; + return t.f[n]; } - }; +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.v = a.v; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.v = vec_sub( _zero, a.v ); return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; - b.v = (_v4_float) vec_cmpeq( a.v, _zero ); + b.v = (_v4_float)vec_cmpeq( a.v, _zero ); return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; _v4_float t = vec_add( a.v, _one ); @@ -1082,10 +1132,10 @@ namespace v4 b.v = t; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; _v4_float t = vec_sub( a.v, _one ); @@ -1094,12 +1144,12 @@ namespace v4 b.v = t; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; _v4_float a_v = a.v; @@ -1108,10 +1158,10 @@ namespace v4 b.v = a_v; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( 
v4float& a, int ) +{ v4float b; _v4_float a_v = a.v; @@ -1120,26 +1170,26 @@ namespace v4 b.v = a_v; return b; - } +} - // v4float binary operators +// v4float binary operators - #define BINARY(op,instr) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - instr; \ - return c; \ - } +#define BINARY( op, instr ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + instr; \ + return c; \ + } - BINARY( +, c.v = vec_add( a.v, b.v ) ) - BINARY( -, c.v = vec_sub( a.v, b.v ) ) - BINARY( *, c.v = vec_madd( a.v, b.v, _zero ) ) +BINARY( +, c.v = vec_add( a.v, b.v ) ) +BINARY( -, c.v = vec_sub( a.v, b.v ) ) +BINARY( *, c.v = vec_madd( a.v, b.v, _zero ) ) - #undef BINARY +#undef BINARY - inline v4float operator /( const v4float &n, const v4float &a ) - { +inline v4float operator/( const v4float& n, const v4float& a ) +{ v4float c; _v4_float a_v = a.v, b_v; @@ -1159,13 +1209,13 @@ namespace v4 b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); // Compute n * refined( (1/a)_estimate ) to get result n/a - + c.v = vec_madd( n.v, b_v, _zero ); return c; - } +} - #if 0 +#if 0 // This is a more accurate version that does two NR iterations. 
inline v4float operator /( const v4float &n, const v4float &a ) { @@ -1194,85 +1244,91 @@ namespace v4 return c; } - #endif +#endif - // v4float logical operators +// v4float logical operators - #define LOGICAL(op,instr) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - instr; \ - return c; \ - } +#define LOGICAL( op, instr ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + instr; \ + return c; \ + } - LOGICAL( <, c.v = (_v4_float) vec_cmplt( a.v, b.v ) ) - LOGICAL( >, c.v = (_v4_float) vec_cmpgt( a.v, b.v ) ) - LOGICAL( ==, c.v = (_v4_float) vec_cmpeq( a.v, b.v ) ) - LOGICAL( <=, c.v = (_v4_float) vec_cmple( a.v, b.v ) ) - LOGICAL( >=, c.v = (_v4_float) vec_cmpge( a.v, b.v ) ) - LOGICAL( !=, c.v = (_v4_float) vec_xor( vec_cmpeq( a.v, b.v ), - _true ) ) - LOGICAL( &&, c.v = (_v4_float) vec_xor( vec_or( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) - LOGICAL( ||, c.v = (_v4_float) vec_xor( vec_and( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) - - #undef LOGICAL - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - union { float f[4]; _v4_float v; } t; \ - v4float b; \ - t.v = a.v; \ - b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ - (float) ::fn( t.f[1] ), \ - (float) ::fn( t.f[2] ), \ - (float) ::fn( t.f[3] ) }; \ - return b; \ - } +LOGICAL( <, c.v = (_v4_float)vec_cmplt( a.v, b.v ) ) +LOGICAL( >, c.v = (_v4_float)vec_cmpgt( a.v, b.v ) ) +LOGICAL( ==, c.v = (_v4_float)vec_cmpeq( a.v, b.v ) ) +LOGICAL( <=, c.v = (_v4_float)vec_cmple( a.v, b.v ) ) +LOGICAL( >=, c.v = (_v4_float)vec_cmpge( a.v, b.v ) ) +LOGICAL( !=, c.v = (_v4_float)vec_xor( vec_cmpeq( a.v, b.v ), _true ) ) +LOGICAL( &&, c.v = (_v4_float)vec_xor( vec_or( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + _true ) ) +LOGICAL( ||, c.v = (_v4_float)vec_xor( vec_and( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + 
_true ) ) + +#undef LOGICAL + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + union { \ + float f[4]; \ + _v4_float v; \ + } t; \ + v4float b; \ + t.v = a.v; \ + b.v = ( _v4_float ){(float)::fn( t.f[0] ), (float)::fn( t.f[1] ), \ + (float)::fn( t.f[2] ), (float)::fn( t.f[3] )}; \ + return b; \ + } - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - union { float f[4]; _v4_float v; } t; \ - union { float f[4]; _v4_float v; } u; \ - v4float c; \ - t.v = a.v; \ - u.v = b.v; \ - c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ - (float) ::fn( t.f[1], u.f[1] ), \ - (float) ::fn( t.f[2], u.f[2] ), \ - (float) ::fn( t.f[3], u.f[3] ) }; \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + union { \ + float f[4]; \ + _v4_float v; \ + } t; \ + union { \ + float f[4]; \ + _v4_float v; \ + } u; \ + v4float c; \ + t.v = a.v; \ + u.v = b.v; \ + c.v = ( _v4_float ){ \ + (float)::fn( t.f[0], u.f[0] ), (float)::fn( t.f[1], u.f[1] ), \ + (float)::fn( t.f[2], u.f[2] ), (float)::fn( t.f[3], u.f[3] )}; \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - #undef CMATH_FR1 - #undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - inline v4float fabs( const v4float &a ) - { + inline v4float 
fabs( const v4float& a ) +{ v4float b; b.v = vec_andc( a.v, _sign ); return b; - } +} - // This version does one NR iteration and is supposed to be accurate enough. - inline v4float sqrt( const v4float &a ) - { +// This version does one NR iteration and is supposed to be accurate enough. +inline v4float sqrt( const v4float& a ) +{ v4float b; _v4_float a_v = a.v, b_v; @@ -1285,17 +1341,16 @@ namespace v4 // APPLE'S ALTIVEC PAGE SUGGESTS TWO. b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), - vec_madd( b_v, _half, _zero ), - b_v ); + vec_madd( b_v, _half, _zero ), b_v ); // Compute the sqrt(a) via a*refined_rsqrt_estimate(a) ~ sqrt(a) b.v = vec_madd( a_v, b_v, _zero ); return b; - } +} - #if 0 +#if 0 // This is a more accurate version that does two NR iterations. inline v4float sqrt( const v4float &a ) { @@ -1323,31 +1378,31 @@ namespace v4 return b; } - #endif +#endif - inline v4float copysign( const v4float &a, const v4float &b ) - { +inline v4float copysign( const v4float& a, const v4float& b ) +{ v4float c; c.v = vec_or( vec_andc( a.v, _sign ), vec_and( b.v, _sign ) ); return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.v = vec_rsqrte( a.v ); return b; - } +} - // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rsqrt( const v4float &a ) - { +// This version does one NR iteration and is supposed to be accurate enough. +inline v4float rsqrt( const v4float& a ) +{ v4float b; _v4_float a_v = a.v, b_v; @@ -1364,13 +1419,12 @@ namespace v4 // b_v ); b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), - vec_madd( b_v, _half, _zero ), - b_v ); + vec_madd( b_v, _half, _zero ), b_v ); return b; - } +} - #if 0 +#if 0 // This is a more accurate version that does two NR iterations. 
inline v4float rsqrt( const v4float &a ) { @@ -1395,20 +1449,20 @@ namespace v4 return b; } - #endif +#endif - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.v = vec_re( a.v ); return b; - } +} - // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rcp( const v4float &a ) - { +// This version does one NR iteration and is supposed to be accurate enough. +inline v4float rcp( const v4float& a ) +{ v4float b; _v4_float a_v = a.v, b_v; @@ -1430,9 +1484,9 @@ namespace v4 b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); return b; - } +} - #if 0 +#if 0 // This is a more accurate version that does two NR iterations. inline v4float rcp( const v4float &a ) { @@ -1458,108 +1512,104 @@ namespace v4 return b; } - #endif +#endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = vec_madd( a.v, b.v, c.v ); return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... 
- d.v = vec_msub( a.v, b.v, c.v ) ; + d.v = vec_msub( a.v, b.v, c.v ); return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = vec_nmsub( a.v, b.v, c.v ); return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = vec_andc( a.v, m.v ); return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = vec_or( a.v, m.v ); return b; - } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = vec_xor( a.v, m.v ); return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vec_st( vec_add( vec_ld( 0, p ), a.v ), 0, p ); - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vec_st( vec_sub( vec_ld( 0, p ), a.v ), 0, p ); - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vec_st( vec_madd( vec_ld( 0, p ), a.v, _zero ), 0, p ); - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ _v4_float z = wl.v, xy; xy = 
vec_add( _one, vec_xor( _n02, vec_mergeh( z, z ) ) ); - z = vec_add( _one, vec_xor( _n02, vec_splat( z, 2 ) ) ); + z = vec_add( _one, vec_xor( _n02, vec_splat( z, 2 ) ) ); - xy = vec_madd( vec_perm( xy, xy, _PERM(0,1,0,1) ), - vec_mergel( xy, xy ), - _zero ); + xy = vec_madd( vec_perm( xy, xy, _PERM( 0, 1, 0, 1 ) ), + vec_mergel( xy, xy ), _zero ); wl.v = vec_madd( xy, vec_splat( z, 0 ), _zero ); wh.v = vec_madd( xy, vec_splat( z, 1 ), _zero ); - } +} - #undef _v4_int - #undef _v4_uint - #undef _v4_float - #undef _v16_uchar +#undef _v4_int +#undef _v4_uint +#undef _v4_float +#undef _v16_uchar - #undef _PERM +#undef _PERM } // namespace v4 diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h index 29612f45..9c0a3160 100644 --- a/src/util/v4/v4_avx.h +++ b/src/util/v4/v4_avx.h @@ -5,194 +5,186 @@ #error "Do not include v4_avx.h directly; use v4.h" #endif -#include #include +#include #define V4_ACCELERATION #define V4_AVX_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - template - struct permute - { - constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; +template +struct permute +{ + constexpr static int value = i0 + i1 * 4 + i2 * 16 + i3 * 64; +}; - #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) +#define PERM( i0, i1, i2, i3 ) ( ( i0 ) + (i1)*4 + (i2)*16 + (i3)*64 ) - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) 
ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void 
swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, 
v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; - __m128 v; + union { + int i[4]; + float f[4]; + __m128 v; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1, 1, 1, 1>::value ) ); return b; - } +} - template - inline v4 shuffle( const v4 
& a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); return b; - } +} - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ __m128 t = a.v; a.v = b.v; b.v = t; - } +} - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; - t = _mm_unpackhi_ps( a0_v, a1_v ); + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); - u = _mm_unpackhi_ps( a2_v, a3_v ); + u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); a1_v = _mm_movehl_ps( a2_v, a0_v ); @@ -204,206 +196,172 @@ namespace v4 a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; - } - - // v4 memory manipulation functions - - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.v = _mm_load_ps( ( float * ) p ); - } - - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, a.v ); - } - - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_stream_ps( ( float * ) p, a.v ); - } - - inline void clear_4x1( void * ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); - } - - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); - } - - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { - __m128 t = _mm_load_ps( ( float * ) a ); - - _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); - _mm_store_ps( ( float * ) b, t ); - } - - // v4 transposed memory manipulation functions - - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.v = _mm_setr_ps( ( (const float *) a0 )[0], - ( (const float *) a1 )[0], - ( (const float *) a2 )[0], - ( (const float *) 
a3 )[0] ); - } - - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { +} + +// v4 memory manipulation functions + +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.v = _mm_load_ps( (float*)p ); +} + +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, a.v ); +} + +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_stream_ps( (float*)p, a.v ); +} + +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, _mm_setzero_ps() ); +} + +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + _mm_store_ps( (float*)dst, _mm_load_ps( (const float*)src ) ); +} + +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ + __m128 t = _mm_load_ps( (float*)a ); + + _mm_store_ps( (float*)a, _mm_load_ps( (float*)b ) ); + _mm_store_ps( (float*)b, t ); +} + +// v4 transposed memory manipulation functions + +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.v = _mm_setr_ps( ( (const float*)a0 )[0], ( (const float*)a1 )[0], + ( (const float*)a2 )[0], ( (const float*)a3 )[0] ); +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ __m128 a_v, b_v, t; b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a0 ), (__m64*)a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a2 ), (__m64*)a3 ); a.v = _mm_shuffle_ps( t, b_v, 0x88 ); b.v = _mm_shuffle_ps( t, b_v, 0xdd ); - } - - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * 
ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { +} + +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ __m128 r, s, t, u, d_v; - a.v = _mm_load_ps( (const float *) a0 ); - b.v = _mm_load_ps( (const float *) a1 ); - c.v = _mm_load_ps( (const float *) a2 ); - d_v = _mm_load_ps( (const float *) a3 ); + a.v = _mm_load_ps( (const float*)a0 ); + b.v = _mm_load_ps( (const float*)a1 ); + c.v = _mm_load_ps( (const float*)a2 ); + d_v = _mm_load_ps( (const float*)a3 ); - r = _mm_unpacklo_ps( a.v, b.v ); - s = _mm_unpackhi_ps( a.v, b.v ); + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); - t = _mm_unpacklo_ps( c.v, d_v ); - u = _mm_unpackhi_ps( c.v, d_v ); + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); a.v = _mm_movelh_ps( r, t ); b.v = _mm_movehl_ps( t, r ); c.v = _mm_movelh_ps( s, u ); - } - - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { +} + +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ __m128 r, s, t, u; - a.v = _mm_load_ps( (const float *) a0 ); - b.v = _mm_load_ps( (const float *) a1 ); - c.v = _mm_load_ps( (const float *) a2 ); - d.v = _mm_load_ps( (const float *) a3 ); + a.v = _mm_load_ps( (const float*)a0 ); + b.v = _mm_load_ps( (const float*)a1 ); + c.v = _mm_load_ps( (const float*)a2 ); + d.v = _mm_load_ps( (const float*)a3 ); - r = _mm_unpackhi_ps( a.v, b.v ); - s = _mm_unpacklo_ps( a.v, b.v ); + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); - t = _mm_unpackhi_ps( c.v, d.v ); - u = _mm_unpacklo_ps( c.v, d.v ); + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); a.v = _mm_movelh_ps( 
s, u ); b.v = _mm_movehl_ps( u, s ); c.v = _mm_movelh_ps( r, t ); d.v = _mm_movehl_ps( t, r ); - } - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (float *) a0 )[0] = a.f[0]; - ( (float *) a1 )[0] = a.f[1]; - ( (float *) a2 )[0] = a.f[2]; - ( (float *) a3 )[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { +} + +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (float*)a0 )[0] = a.f[0]; + ( (float*)a1 )[0] = a.f[1]; + ( (float*)a2 )[0] = a.f[2]; + ( (float*)a3 )[0] = a.f[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ __m128 t; - t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 - } + _mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 +} - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 t; - t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 
b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + _mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 - ( (float *) a0 )[2] = c.f[0]; - ( (float *) a1 )[2] = c.f[1]; - ( (float *) a2 )[2] = c.f[2]; - ( (float *) a3 )[2] = c.f[3]; - } + ( (float*)a0 )[2] = c.f[0]; + ( (float*)a1 )[2] = c.f[1]; + ( (float*)a2 )[2] = c.f[2]; + ( (float*)a3 )[2] = c.f[3]; +} - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 a_v, b_v, c_v, d_v, t, u; - t = _mm_unpackhi_ps( a.v, b.v ); + t = _mm_unpackhi_ps( a.v, b.v ); a_v = _mm_unpacklo_ps( a.v, b.v ); - u = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpackhi_ps( c.v, d.v ); c_v = _mm_unpacklo_ps( c.v, d.v ); b_v = _mm_movehl_ps( c_v, a_v ); @@ -411,241 +369,259 @@ namespace v4 c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *) a0, a_v ); - _mm_store_ps( (float *) a1, b_v ); - _mm_store_ps( (float *) a2, c_v ); - _mm_store_ps( (float *) a3, d_v ); - } + _mm_store_ps( (float*)a0, a_v ); + _mm_store_ps( (float*)a1, b_v ); + _mm_store_ps( (float*)a2, c_v ); + _mm_store_ps( (float*)a3, d_v ); +} - ////////////// - // v4int class +////////////// +// v4int class - class v4int : public v4 - { +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - 
friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) 
ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& 
a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) 
ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; + union { + int i; + float f; + } u; - u.i = a; - v = _mm_set1_ps( u.f ); + u.i = a; + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - union - { - int i; - float f; - } u0, u1, u2, u3; + union { + int i; + float f; + } u0, u1, u2, u3; - u0.i = i0; - u1.i = i1; - u2.i = i2; - u3.i = i3; + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; - v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + v = _mm_setr_ps( u0.f, u1.f, u2.f, 
u3.f ); } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) - #undef ASSIGN +#undef ASSIGN - inline v4int &operator =( const v4int &b ) + inline v4int& operator=( const v4int& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } - inline v4int &operator ^=( const v4int &b ) + inline v4int& operator^=( const v4int& b ) { - v = _mm_xor_ps( v, b.v ); + v = _mm_xor_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator &=( const v4int &b ) + inline v4int& operator&=( const v4int& b ) { - v = _mm_and_ps( v, b.v ); + v = _mm_and_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator |=( const v4int &b ) + inline v4int& operator|=( const v4int& b ) { - v = _mm_or_ps( v, b.v ); + v = _mm_or_ps( v, b.v ); - return *this; + return *this; } // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int &a ) 
\ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - inline v4int operator +( const v4int &a ) - { + +inline v4int operator+( const v4int& a ) +{ v4int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v4int operator !( const v4int &a ) - { +inline v4int operator!( const v4int& a ) +{ v4int b; - b.i[0] = - ( ! a.i[0] ); - b.i[1] = - ( ! a.i[1] ); - b.i[2] = - ( ! a.i[2] ); - b.i[3] = - ( ! a.i[3] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); return b; - } +} - inline v4int operator ~( const v4int &a ) - { +inline v4int operator~( const v4int& a ) +{ v4int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; @@ -653,124 +629,124 @@ namespace v4 b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); return b; - } - - #undef PREFIX_UNARY - - // v4int prefix increment / decrement - - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int &a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - - #undef PREFIX_INCDEC - - // v4int postfix increment / decrement - - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int &a, int ) \ - { \ - v4int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - - #undef POSTFIX_INCDEC - - // v4int binary operators - - #define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) - - #undef 
BINARY - - inline v4int operator ^( const v4int &a, const v4int &b ) - { +} + +#undef PREFIX_UNARY + +// v4int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v4int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v4int binary operators + +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +inline v4int operator^( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_xor_ps( a.v, b.v ); return c; - } +} - inline v4int operator &( const v4int &a, const v4int &b ) - { +inline v4int operator&( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_and_ps( a.v, b.v ); return c; - } +} - inline v4int operator |( const v4int &a, const v4int &b ) - { +inline v4int operator|( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_or_ps( a.v, b.v ); return c; - } - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - 
LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4int miscellaneous functions - - inline v4int abs( const v4int &a ) - { +} + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4int miscellaneous functions + +inline v4int abs( const v4int& a ) +{ v4int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -779,148 +755,182 @@ namespace v4 b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_andnot_ps( c.v, a.v ); return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_and_ps( c.v, a.v ); return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; __m128 c_v = c.v; - tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), - _mm_and_ps( c_v, t.v ) ); + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), _mm_and_ps( c_v, t.v ) ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + 
friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( 
const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); 
+ CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const 
v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - v = _mm_set1_ps( a ); + v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - v = _mm_setr_ps( f0, f1, f2, f3 ); + v = _mm_setr_ps( f0, f1, f2, f3 ); } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op,intrin) \ - inline v4float &operator op( const v4float &b ) \ - { \ - v = intrin( v, b.v ); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } ASSIGN( +=, _mm_add_ps ) @@ -928,61 +938,55 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) - #undef ASSIGN 
+#undef ASSIGN - inline v4float &operator =( const v4float &b ) + inline v4float& operator=( const v4float& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.v = a.v; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); @@ -991,10 +995,10 @@ namespace v4 b.v = t; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1003,12 +1007,12 @@ namespace v4 b.v = t; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1017,10 +1021,10 @@ namespace v4 b.v = a_v; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1029,175 +1033,171 @@ namespace v4 b.v = a_v; 
return b; - } - - // v4float binary operators - - #define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } - - BINARY( +, _mm_add_ps ) - BINARY( -, _mm_sub_ps ) - BINARY( *, _mm_mul_ps ) - BINARY( /, _mm_div_ps ) - - #undef BINARY - - // v4float logical operators - - #define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } - - LOGICAL( <, _mm_cmplt_ps ) - LOGICAL( >, _mm_cmpgt_ps ) - LOGICAL( ==, _mm_cmpeq_ps ) - LOGICAL( <=, _mm_cmple_ps ) - LOGICAL( >=, _mm_cmpge_ps ) - LOGICAL( !=, _mm_cmpneq_ps ) - - #undef LOGICAL - - inline v4int operator &&( const v4float &a, const v4float &b ) - { +} + +// v4float binary operators + +#define BINARY( op, intrin ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + +BINARY( +, _mm_add_ps ) +BINARY( -, _mm_sub_ps ) +BINARY( *, _mm_mul_ps ) +BINARY( /, _mm_div_ps ) + +#undef BINARY + +// v4float logical operators + +#define LOGICAL( op, intrin ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + +LOGICAL( <, _mm_cmplt_ps ) +LOGICAL( >, _mm_cmpgt_ps ) +LOGICAL( ==, _mm_cmpeq_ps ) +LOGICAL( <=, _mm_cmple_ps ) +LOGICAL( >=, _mm_cmpge_ps ) +LOGICAL( !=, _mm_cmpneq_ps ) + +#undef LOGICAL + +inline v4int operator&&( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = + _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } +} - inline v4int operator ||( const v4float &a, const v4float &b ) - { +inline v4int operator||( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - 
c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - return b; \ - } - - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - - #undef CMATH_FR1 - #undef CMATH_FR2 - - inline v4float fabs( const v4float &a ) - { +} + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) 
CMATH_FR1( tanh ) + +#undef CMATH_FR1 +#undef CMATH_FR2 + + inline v4float fabs( const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); return b; - } +} - inline v4float sqrt( const v4float &a ) - { +inline v4float sqrt( const v4float& a ) +{ v4float b; b.v = _mm_sqrt_ps( a.v ); return b; - } +} - inline v4float copysign( const v4float &a, const v4float &b ) - { +inline v4float copysign( const v4float& a, const v4float& b ) +{ v4float c; __m128 t = _mm_set1_ps( -0.0f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), - _mm_andnot_ps( t, a.v ) ); + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.v = _mm_rsqrt_ps( a.v ); return b; - } +} - inline v4float rsqrt( const v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; __m128 a_v = a.v, b_v; b_v = _mm_rsqrt_ps( a_v ); - b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), - _mm_sub_ps( b_v, - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) - ) - ) - ) - ) - ); + b.v = _mm_add_ps( + b_v, + _mm_mul_ps( + _mm_set1_ps( 0.5f ), + _mm_sub_ps( + b_v, _mm_mul_ps( + a_v, _mm_mul_ps( b_v, _mm_mul_ps( b_v, b_v ) ) ) ) ) ); return b; - } +} - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.v = _mm_rcp_ps( a.v ); return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; __m128 a_v = a.v, b_v; @@ -1205,118 +1205,106 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, b_v ) - ) - ); + _mm_mul_ps( a_v, _mm_mul_ps( b_v, b_v ) ) ); return b; - } +} - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, 
const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( m.v, a.v ); return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_or_ps( m.v, a.v ); return b; - } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void 
trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ __m128 l = _mm_set1_ps( 1.0f ); __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); __m128 z = wl.v, xy; - xy = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) - ) - ); + xy = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 0, 0, 1, 1 ) ) ) ); - z = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) - ) - ); + z = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 2, 2, 2, 2 ) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM( 0, 1, 0, 1 ) ), + _mm_shuffle_ps( xy, xy, PERM( 2, 2, 3, 3 ) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 0, 0, 0, 0 ) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); - } + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 1, 1, 1, 1 ) ) ); +} - #undef PERM +#undef PERM } // namespace v4 diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 2cab8b9c..ebd54717 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -12,187 +12,179 @@ #define V4_AVX2_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - template - struct permute - { - constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; +template +struct permute +{ + constexpr static int value = i0 + i1 * 4 + i2 * 16 + i3 * 64; +}; - #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + 
(i3)*64) +#define PERM( i0, i1, i2, i3 ) ( ( i0 ) + (i1)*4 + (i2)*16 + (i3)*64 ) - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( 
const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void 
* ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; - __m128 v; + union { + int i[4]; + float f[4]; + __m128 v; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a 
) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1, 1, 1, 1>::value ) ); return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); return b; - } +} - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ __m128 t = a.v; a.v = b.v; b.v = t; - } +} - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; - t = _mm_unpackhi_ps( a0_v, a1_v ); + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); - u = _mm_unpackhi_ps( a2_v, a3_v ); + u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); a1_v = _mm_movehl_ps( a2_v, a0_v ); @@ -204,107 +196,92 @@ namespace v4 a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; - } - - // v4 memory manipulation functions +} - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.v = _mm_load_ps( ( float * ) p ); - } +// v4 memory manipulation functions - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, a.v ); - } +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.v = _mm_load_ps( (float*)p ); +} - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_stream_ps( ( float * ) p, a.v ); - } +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, a.v ); +} - inline void clear_4x1( void * 
ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); - } +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_stream_ps( (float*)p, a.v ); +} - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); - } +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, _mm_setzero_ps() ); +} - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { - __m128 t = _mm_load_ps( ( float * ) a ); +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + _mm_store_ps( (float*)dst, _mm_load_ps( (const float*)src ) ); +} - _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); - _mm_store_ps( ( float * ) b, t ); - } +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ + __m128 t = _mm_load_ps( (float*)a ); - // v4 transposed memory manipulation functions + _mm_store_ps( (float*)a, _mm_load_ps( (float*)b ) ); + _mm_store_ps( (float*)b, t ); +} - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.v = _mm_setr_ps( ( (const float *) a0 )[0], - ( (const float *) a1 )[0], - ( (const float *) a2 )[0], - ( (const float *) a3 )[0] ); - } +// v4 transposed memory manipulation functions - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.v = _mm_setr_ps( ( (const float*)a0 )[0], ( (const float*)a1 )[0], + ( (const float*)a2 )[0], ( (const float*)a3 )[0] ); +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ __m128 a_v, b_v, t; b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( 
_mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a0 ), (__m64*)a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a2 ), (__m64*)a3 ); a.v = _mm_shuffle_ps( t, b_v, 0x88 ); b.v = _mm_shuffle_ps( t, b_v, 0xdd ); - } +} - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ __m128 r, s, t, u, d_v; - a.v = _mm_load_ps( (const float *) a0 ); - b.v = _mm_load_ps( (const float *) a1 ); - c.v = _mm_load_ps( (const float *) a2 ); - d_v = _mm_load_ps( (const float *) a3 ); + a.v = _mm_load_ps( (const float*)a0 ); + b.v = _mm_load_ps( (const float*)a1 ); + c.v = _mm_load_ps( (const float*)a2 ); + d_v = _mm_load_ps( (const float*)a3 ); - r = _mm_unpacklo_ps( a.v, b.v ); - s = _mm_unpackhi_ps( a.v, b.v ); + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); - t = _mm_unpacklo_ps( c.v, d_v ); - u = _mm_unpackhi_ps( c.v, d_v ); + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); a.v = _mm_movelh_ps( r, t ); b.v = _mm_movehl_ps( t, r ); c.v = _mm_movelh_ps( s, u ); - } +} - #if 0 +#if 0 inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -333,37 +310,34 @@ namespace v4 b.v = b_v; c.v = c_v; } - #endif +#endif - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& 
c, + v4& d ) +{ __m128 r, s, t, u; - a.v = _mm_load_ps( (const float *) a0 ); - b.v = _mm_load_ps( (const float *) a1 ); - c.v = _mm_load_ps( (const float *) a2 ); - d.v = _mm_load_ps( (const float *) a3 ); + a.v = _mm_load_ps( (const float*)a0 ); + b.v = _mm_load_ps( (const float*)a1 ); + c.v = _mm_load_ps( (const float*)a2 ); + d.v = _mm_load_ps( (const float*)a3 ); - r = _mm_unpackhi_ps( a.v, b.v ); - s = _mm_unpacklo_ps( a.v, b.v ); + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); - t = _mm_unpackhi_ps( c.v, d.v ); - u = _mm_unpacklo_ps( c.v, d.v ); + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); a.v = _mm_movelh_ps( s, u ); b.v = _mm_movehl_ps( u, s ); c.v = _mm_movelh_ps( r, t ); d.v = _mm_movehl_ps( t, r ); - } +} - #if 0 +#if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -390,9 +364,9 @@ namespace v4 b.v = _mm_movehl_ps( c_v, a_v ); d.v = _mm_movehl_ps( u, t ); } - #endif +#endif - #if 0 +#if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -417,9 +391,9 @@ namespace v4 d_v = _mm_movehl_ps( u, t ); a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; } - #endif +#endif - #if 0 +#if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -446,9 +420,9 @@ namespace v4 c.v = _mm_movelh_ps( t, u ); d.v = _mm_movehl_ps( u, t ); } - #endif +#endif - #if 0 +#if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -475,80 +449,64 @@ namespace v4 d.v = _mm_movehl_ps( u, t ); c.v = _mm_movelh_ps( t, u ); } - #endif - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (float *) a0 )[0] = a.f[0]; - ( (float *) a1 )[0] = a.f[1]; - ( (float *) a2 )[0] = a.f[2]; - ( (float *) a3 )[0] = a.f[3]; - } +#endif - inline void store_4x2_tr( 
const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (float*)a0 )[0] = a.f[0]; + ( (float*)a1 )[0] = a.f[1]; + ( (float*)a2 )[0] = a.f[2]; + ( (float*)a3 )[0] = a.f[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ __m128 t; - t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 - } + _mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 +} - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 t; - t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 
-> a3 + _mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 - ( (float *) a0 )[2] = c.f[0]; - ( (float *) a1 )[2] = c.f[1]; - ( (float *) a2 )[2] = c.f[2]; - ( (float *) a3 )[2] = c.f[3]; - } + ( (float*)a0 )[2] = c.f[0]; + ( (float*)a1 )[2] = c.f[1]; + ( (float*)a2 )[2] = c.f[2]; + ( (float*)a3 )[2] = c.f[3]; +} - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 a_v, b_v, c_v, d_v, t, u; - t = _mm_unpackhi_ps( a.v, b.v ); + t = _mm_unpackhi_ps( a.v, b.v ); a_v = _mm_unpacklo_ps( a.v, b.v ); - u = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpackhi_ps( c.v, d.v ); c_v = _mm_unpacklo_ps( c.v, d.v ); b_v = _mm_movehl_ps( c_v, a_v ); @@ -556,241 +514,259 @@ namespace v4 c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *) a0, a_v ); - _mm_store_ps( (float *) a1, b_v ); - _mm_store_ps( (float *) a2, c_v ); - _mm_store_ps( (float *) a3, d_v ); - } + _mm_store_ps( (float*)a0, a_v ); + _mm_store_ps( (float*)a1, b_v ); + _mm_store_ps( (float*)a2, c_v ); + _mm_store_ps( (float*)a3, d_v ); +} - ////////////// - // v4int class +////////////// +// v4int class - class v4int : public v4 - { +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const 
v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( 
const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend 
inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline 
v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; + union { + int i; + float f; + } u; - u.i = a; - v = _mm_set1_ps( u.f ); + u.i = a; + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - union - { - int i; - float f; - } u0, u1, u2, u3; + union { + int i; + float f; + } u0, u1, u2, u3; - u0.i = i0; - u1.i = i1; - u2.i = i2; - u3.i = i3; + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; - v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op 
b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) - #undef ASSIGN +#undef ASSIGN - inline v4int &operator =( const v4int &b ) + inline v4int& operator=( const v4int& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } - inline v4int &operator ^=( const v4int &b ) + inline v4int& operator^=( const v4int& b ) { - v = _mm_xor_ps( v, b.v ); + v = _mm_xor_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator &=( const v4int &b ) + inline v4int& operator&=( const v4int& b ) { - v = _mm_and_ps( v, b.v ); + v = _mm_and_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator |=( const v4int &b ) + inline v4int& operator|=( const v4int& b ) { - v = _mm_or_ps( v, b.v ); + v = _mm_or_ps( v, b.v ); - return *this; + return *this; } // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int &a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - inline v4int operator +( const v4int &a ) - { +inline v4int operator+( const v4int& a ) +{ v4int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v4int operator !( const v4int &a ) - { +inline v4int 
operator!( const v4int& a ) +{ v4int b; - b.i[0] = - ( ! a.i[0] ); - b.i[1] = - ( ! a.i[1] ); - b.i[2] = - ( ! a.i[2] ); - b.i[3] = - ( ! a.i[3] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); return b; - } +} - inline v4int operator ~( const v4int &a ) - { +inline v4int operator~( const v4int& a ) +{ v4int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; @@ -798,124 +774,124 @@ namespace v4 b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); return b; - } +} - #undef PREFIX_UNARY +#undef PREFIX_UNARY - // v4int prefix increment / decrement +// v4int prefix increment / decrement - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int &a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) - #undef PREFIX_INCDEC +#undef PREFIX_INCDEC - // v4int postfix increment / decrement +// v4int postfix increment / decrement - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int &a, int ) \ - { \ - v4int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - return b; \ - } +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) - #undef POSTFIX_INCDEC +#undef POSTFIX_INCDEC - // v4int binary operators +// v4int binary operators - #define BINARY(op) \ - inline v4int operator 
op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - return c; \ - } +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) - #undef BINARY +#undef BINARY - inline v4int operator ^( const v4int &a, const v4int &b ) - { +inline v4int operator^( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_xor_ps( a.v, b.v ); return c; - } +} - inline v4int operator &( const v4int &a, const v4int &b ) - { +inline v4int operator&( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_and_ps( a.v, b.v ); return c; - } +} - inline v4int operator |( const v4int &a, const v4int &b ) - { +inline v4int operator|( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_or_ps( a.v, b.v ); return c; - } - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - return c; \ - } +} + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= 
) +LOGICAL( &&) +LOGICAL( || ) - #undef LOGICAL +#undef LOGICAL - // v4int miscellaneous functions +// v4int miscellaneous functions - inline v4int abs( const v4int &a ) - { +inline v4int abs( const v4int& a ) +{ v4int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -924,148 +900,182 @@ namespace v4 b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_andnot_ps( c.v, a.v ); return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_and_ps( c.v, a.v ); return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; __m128 c_v = c.v; - tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), - _mm_and_ps( c_v, t.v ) ); + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), _mm_and_ps( c_v, t.v ) ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; 
+ friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const 
v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const 
v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void 
increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - v = _mm_set1_ps( a ); + v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - v = _mm_setr_ps( f0, f1, f2, f3 ); + v = _mm_setr_ps( f0, f1, f2, f3 ); } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op,intrin) \ - inline v4float &operator op( const v4float &b ) \ - { \ - v = intrin( v, b.v ); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } ASSIGN( +=, _mm_add_ps ) @@ -1073,61 +1083,55 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) - #undef ASSIGN +#undef ASSIGN - inline v4float &operator =( const v4float &b ) + inline v4float& operator=( const v4float& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v4float 
prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.v = a.v; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1136,10 +1140,10 @@ namespace v4 b.v = t; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1148,12 +1152,12 @@ namespace v4 b.v = t; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1162,10 +1166,10 @@ namespace v4 b.v = a_v; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1174,161 +1178,158 @@ namespace v4 b.v = a_v; return b; - } +} - // v4float binary operators +// v4float binary operators - #define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - BINARY( +, _mm_add_ps ) - BINARY( -, 
_mm_sub_ps ) - BINARY( *, _mm_mul_ps ) - BINARY( /, _mm_div_ps ) +BINARY( +, _mm_add_ps ) +BINARY( -, _mm_sub_ps ) +BINARY( *, _mm_mul_ps ) +BINARY( /, _mm_div_ps ) - #undef BINARY +#undef BINARY - // v4float logical operators +// v4float logical operators - #define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define LOGICAL( op, intrin ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - LOGICAL( <, _mm_cmplt_ps ) - LOGICAL( >, _mm_cmpgt_ps ) - LOGICAL( ==, _mm_cmpeq_ps ) - LOGICAL( <=, _mm_cmple_ps ) - LOGICAL( >=, _mm_cmpge_ps ) - LOGICAL( !=, _mm_cmpneq_ps ) +LOGICAL( <, _mm_cmplt_ps ) +LOGICAL( >, _mm_cmpgt_ps ) +LOGICAL( ==, _mm_cmpeq_ps ) +LOGICAL( <=, _mm_cmple_ps ) +LOGICAL( >=, _mm_cmpge_ps ) +LOGICAL( !=, _mm_cmpneq_ps ) - #undef LOGICAL +#undef LOGICAL - inline v4int operator &&( const v4float &a, const v4float &b ) - { +inline v4int operator&&( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = + _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } +} - inline v4int operator ||( const v4float &a, const v4float &b ) - { +inline v4int operator||( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - return b; \ - } +} + +// v4float math library functions + +#define 
CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - #undef CMATH_FR1 - #undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - inline v4float fabs( const v4float &a ) - { + inline v4float fabs( const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); return b; - } +} - inline v4float sqrt( const v4float &a ) - { +inline v4float sqrt( const v4float& a ) +{ v4float b; b.v = _mm_sqrt_ps( a.v ); return b; - } +} - inline v4float copysign( const v4float &a, const v4float &b ) - { +inline v4float copysign( const v4float& a, const v4float& b ) +{ v4float c; __m128 t = _mm_set1_ps( -0.0f ); 
- c.v = _mm_or_ps( _mm_and_ps( t, b.v ), - _mm_andnot_ps( t, a.v ) ); + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.v = _mm_rsqrt_ps( a.v ); return b; - } +} - inline v4float rsqrt( const v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; __m128 b_v; b_v = _mm_rsqrt_ps( a.v ); - b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + b.v = _mm_fmadd_ps( + _mm_set1_ps( 0.5f ), + _mm_fnmadd_ps( a.v, _mm_mul_ps( b_v, _mm_mul_ps( b_v, b_v ) ), b_v ), + b_v ); return b; - } +} - #if 0 +#if 0 inline v4float rsqrt( const v4float &a ) { v4float b; @@ -1350,9 +1351,9 @@ namespace v4 return b; } - #endif +#endif - #if 0 +#if 0 inline v4float rsqrt( const v4float &a ) { v4float b; @@ -1370,33 +1371,31 @@ namespace v4 return b; } - #endif +#endif - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.v = _mm_rcp_ps( a.v ); return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; __m128 b_v; b_v = _mm_rcp_ps( a.v ); - b.v = _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); + b.v = _mm_fnmadd_ps( a.v, _mm_mul_ps( b_v, b_v ), _mm_add_ps( b_v, b_v ) ); return b; - } +} - #if 0 +#if 0 inline v4float rcp( const v4float &a ) { v4float b; @@ -1413,9 +1412,9 @@ namespace v4 return b; } - #endif +#endif - #if 0 +#if 0 inline v4float rcp( const v4float &a ) { v4float b; @@ -1430,112 +1429,103 @@ namespace v4 return b; } - #endif +#endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_fmadd_ps( 
a.v, b.v, c.v ); return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_fmsub_ps( a.v, b.v, c.v ); return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_fnmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( m.v, a.v ); return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_or_ps( m.v, a.v ); return b; - } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) 
(1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ __m128 l = _mm_set1_ps( 1.0f ); __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); __m128 z = wl.v, xy; - xy = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) - ) - ); + xy = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 0, 0, 1, 1 ) ) ) ); - z = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) - ) - ); + z = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 2, 2, 2, 2 ) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM( 0, 1, 0, 1 ) ), + _mm_shuffle_ps( xy, xy, PERM( 2, 2, 3, 3 ) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 0, 0, 0, 0 ) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); - } + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 1, 1, 1, 1 ) ) ); +} - #undef PERM +#undef PERM } // namespace v4 diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 170d2649..35f0bd2d 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -12,7 +12,7 @@ #define V4_NEON_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif // This does not work with gcc 5.3.1 and the -fopenmp-simd @@ -23,192 +23,172 @@ // #define ALWAYS_VECTORIZE _Pragma( "simd" ) -#define ALWAYS_VECTORIZE \ - _Pragma( "simd" ) \ - _Pragma( "vector aligned" ) +#define ALWAYS_VECTORIZE _Pragma( "simd" ) _Pragma( "vector aligned" ) -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - //////////////// - // v4 base class +//////////////// +// v4 base class 
- class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( 
void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void load_4x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &b00, v4 &b01, - v4 &b02, v4 &b03, - v4 &b04, v4 &b05, - v4 &b06, v4 &b07 ) ALWAYS_INLINE; - - friend inline void load_4x16_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &b00, v4 &b01, - v4 &b02, v4 &b03, - v4 &b04, v4 &b05, - v4 &b06, v4 &b07, - v4 &b08, v4 &b09, - v4 &b10, v4 &b11, - v4 &b12, v4 &b13, - v4 &b14, v4 &b15 ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) 
a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x8_tr( const v4 &b00, const v4 &b01, - const v4 &b02, const v4 &b03, - const v4 &b04, const v4 &b05, - const v4 &b06, const v4 &b07, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void load_4x8_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& b00, + v4& b01, v4& b02, v4& b03, v4& b04, v4& b05, + v4& b06, v4& b07 ) ALWAYS_INLINE; + + friend inline void + load_4x16_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + v4& b00, v4& b01, v4& b02, v4& b03, v4& b04, v4& b05, v4& b06, + v4& b07, v4& b08, v4& b09, v4& b10, v4& b11, 
v4& b12, v4& b13, + v4& b14, v4& b15 ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void + store_4x8_tr( const v4& b00, const v4& b01, const v4& b02, const v4& b03, + const v4& b04, const v4& b05, const v4& b06, const v4& b07, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; - int32x4_t vsi; - uint32x4_t vui; - float32x4_t v; + union { + int i[4]; + float f[4]; + int32x4_t vsi; + uint32x4_t vui; + float32x4_t v; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; ALWAYS_VECTORIZE - for( 
int j = 0; j < 4; j++ ) - b.i[j] = a.i[n]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[n]; return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; b.i[0] = a.i[i0]; @@ -217,12 +197,12 @@ namespace v4 b.i[3] = a.i[i3]; return b; - } +} - #define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ // __m128 a_v = a.v; // a.v = b.v; @@ -230,13 +210,13 @@ namespace v4 // b.v = a_v; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - sw( a.i[j], b.i[j] ); - } + for ( int j = 0; j < 4; j++ ) + sw( a.i[j], b.i[j] ); +} - #if 1 - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { +#if 1 +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ float32x4x2_t r, s; r = vtrnq_f32( a0.v, a1.v ); @@ -247,10 +227,10 @@ namespace v4 a1.v = vtrn1q_f64( r.val[1], s.val[1] ); a3.v = vtrn2q_f64( r.val[1], s.val[1] ); - } - #endif +} +#endif - #if 0 +#if 0 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t r, s, t, u; @@ -267,9 +247,9 @@ namespace v4 a1.v = vtrn1q_f64( s, u ); a3.v = vtrn2q_f64( s, u ); } - #endif +#endif - #if 0 +#if 0 // Portable version. 
inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { @@ -277,79 +257,69 @@ namespace v4 sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a2.i[3],a3.i[2] ); } - #endif +#endif - #undef sw +#undef sw - // v4 memory manipulation functions +// v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.v = vld1q_f32( ( float * ) p ); - } +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.v = vld1q_f32( (float*)p ); +} - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - vst1q_f32( ( float * ) p, a.v ); - } +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + vst1q_f32( (float*)p, a.v ); +} - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ( (int * ALIGNED(16) ) p )[j] = a.i[j]; - } - - inline void clear_4x1( void * ALIGNED(16) p ) - { - vst1q_f32( ( float * ) p, vdupq_n_f32( 0.0f ) ); - } + for ( int j = 0; j < 4; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = a.i[j]; +} - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - vst1q_f32( ( float * ) dst, vld1q_f32( ( const float * ) src ) ); - } +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + vst1q_f32( (float*)p, vdupq_n_f32( 0.0f ) ); +} - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { - float32x4_t t = vld1q_f32( ( float * ) a ); +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + vst1q_f32( (float*)dst, vld1q_f32( (const float*)src ) ); +} - vst1q_f32( ( float * ) a, vld1q_f32( ( float * ) b ) ); - vst1q_f32( ( float * ) b, t ); - } +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ + float32x4_t t = vld1q_f32( (float*)a ); - // v4 transposed memory manipulation functions + vst1q_f32( (float*)a, vld1q_f32( (float*)b ) ); + vst1q_f32( (float*)b, t ); +} - inline void load_4x1_tr( const 
void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.i[0] = ( (const int *) a0 )[0]; - a.i[1] = ( (const int *) a1 )[0]; - a.i[2] = ( (const int *) a2 )[0]; - a.i[3] = ( (const int *) a3 )[0]; - } +// v4 transposed memory manipulation functions - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ float32x4_t r, s, t, u, a2_v, a3_v; - a.v = vld1q_f32( (const float *) a0 ); - b.v = vld1q_f32( (const float *) a1 ); - a2_v = vld1q_f32( (const float *) a2 ); - a3_v = vld1q_f32( (const float *) a3 ); + a.v = vld1q_f32( (const float*)a0 ); + b.v = vld1q_f32( (const float*)a1 ); + a2_v = vld1q_f32( (const float*)a2 ); + a3_v = vld1q_f32( (const float*)a3 ); r = vtrn1q_f32( a.v, b.v ); s = vtrn2q_f32( a.v, b.v ); @@ -359,49 +329,43 @@ namespace v4 a.v = vtrn1q_f64( r, t ); b.v = vtrn1q_f64( s, u ); - } +} - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ float32x4_t r, s, t, u, d_v; - a.v = vld1q_f32( (const float *) a0 ); - b.v = vld1q_f32( (const float *) a1 ); - c.v = vld1q_f32( (const float *) a2 ); - d_v = vld1q_f32( (const float *) a3 ); + a.v = vld1q_f32( (const float*)a0 ); + b.v = vld1q_f32( (const float*)a1 ); + c.v = 
vld1q_f32( (const float*)a2 ); + d_v = vld1q_f32( (const float*)a3 ); - r = vtrn1q_f32( a.v, b.v ); - s = vtrn2q_f32( a.v, b.v ); + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); - t = vtrn1q_f32( c.v, d_v ); - u = vtrn2q_f32( c.v, d_v ); + t = vtrn1q_f32( c.v, d_v ); + u = vtrn2q_f32( c.v, d_v ); a.v = vtrn1q_f64( r, t ); b.v = vtrn1q_f64( s, u ); c.v = vtrn2q_f64( r, t ); - } +} - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ float32x4_t r, s, t, u; - a.v = vld1q_f32( (const float *) a0 ); - b.v = vld1q_f32( (const float *) a1 ); - c.v = vld1q_f32( (const float *) a2 ); - d.v = vld1q_f32( (const float *) a3 ); + a.v = vld1q_f32( (const float*)a0 ); + b.v = vld1q_f32( (const float*)a1 ); + c.v = vld1q_f32( (const float*)a2 ); + d.v = vld1q_f32( (const float*)a3 ); r = vtrn1q_f32( a.v, b.v ); s = vtrn2q_f32( a.v, b.v ); @@ -413,24 +377,17 @@ namespace v4 b.v = vtrn1q_f64( s, u ); c.v = vtrn2q_f64( r, t ); d.v = vtrn2q_f64( s, u ); - } - - #if 1 - inline void load_4x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &b00, - v4 &b01, - v4 &b02, - v4 &b03, - v4 &b04, - v4 &b05, - v4 &b06, - v4 &b07 ) - { - float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); - float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); +} + +#if 1 +inline void load_4x8_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& b00, v4& b01, + v4& b02, v4& b03, v4& b04, v4& b05, v4& b06, v4& b07 ) +{ + float32x4x4_t mat0 = vld4q_f32( (const float*)a0 ); + float32x4x4_t mat2 = vld4q_f32( (const float*)a2 
); b00.v = vuzp1q_f32( mat0.val[0], mat2.val[0] ); b01.v = vuzp1q_f32( mat0.val[1], mat2.val[1] ); @@ -441,38 +398,25 @@ namespace v4 b05.v = vuzp2q_f32( mat0.val[1], mat2.val[1] ); b06.v = vuzp2q_f32( mat0.val[2], mat2.val[2] ); b07.v = vuzp2q_f32( mat0.val[3], mat2.val[3] ); - } - #endif - - #if 1 - inline void load_4x16_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &b00, - v4 &b01, - v4 &b02, - v4 &b03, - v4 &b04, - v4 &b05, - v4 &b06, - v4 &b07, - v4 &b08, - v4 &b09, - v4 &b10, - v4 &b11, - v4 &b12, - v4 &b13, - v4 &b14, - v4 &b15 ) - { +} +#endif + +#if 1 +inline void load_4x16_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& b00, v4& b01, + v4& b02, v4& b03, v4& b04, v4& b05, v4& b06, v4& b07, + v4& b08, v4& b09, v4& b10, v4& b11, v4& b12, v4& b13, + v4& b14, v4& b15 ) +{ float32x4_t c00, c01, c02, c03, c04, c05, c06, c07; float32x4_t c08, c09, c10, c11, c12, c13, c14, c15; - float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); - float32x4x4_t mat1 = vld4q_f32( (const float *) a1 ); - float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); - float32x4x4_t mat3 = vld4q_f32( (const float *) a3 ); + float32x4x4_t mat0 = vld4q_f32( (const float*)a0 ); + float32x4x4_t mat1 = vld4q_f32( (const float*)a1 ); + float32x4x4_t mat2 = vld4q_f32( (const float*)a2 ); + float32x4x4_t mat3 = vld4q_f32( (const float*)a3 ); c00 = vuzp1q_f32( mat0.val[0], mat1.val[0] ); c01 = vuzp1q_f32( mat0.val[1], mat1.val[1] ); @@ -511,28 +455,21 @@ namespace v4 b13.v = vuzp2q_f32( c05, c13 ); b14.v = vuzp2q_f32( c06, c14 ); b15.v = vuzp2q_f32( c07, c15 ); - } - #endif - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (int *) a0 )[0] = a.i[0]; - ( (int *) a1 )[0] = a.i[1]; - ( (int *) a2 )[0] = a.i[2]; - ( (int *) a3 )[0] = a.i[3]; - } +} +#endif - inline void store_4x2_tr( 
const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ // __m128 a_v = a.v, b_v = b.v, t; // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t @@ -545,27 +482,23 @@ namespace v4 // _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 // _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 - ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; - ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; - ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; - ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; - } + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; +} - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ // __m128 a_v = a.v, b_v = b.v, t; // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t @@ -583,32 +516,27 @@ namespace v4 // ((float *)a2)[2] = c.f[2]; // ((float *)a3)[2] = c.f[3]; - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] 
= b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - } + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; +} - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ float32x4_t r, s, t, u; r = vtrn1q_f32( a.v, b.v ); @@ -617,26 +545,19 @@ namespace v4 t = vtrn1q_f32( c.v, d.v ); u = vtrn2q_f32( c.v, d.v ); - vst1q_f32( (float *) a0, vtrn1q_f64( r, t ) ); - vst1q_f32( (float *) a1, vtrn1q_f64( s, u ) ); - vst1q_f32( (float *) a2, vtrn2q_f64( r, t ) ); - vst1q_f32( (float *) a3, vtrn2q_f64( s, u ) ); - } - - #if 1 - inline void store_4x8_tr( const v4 &b00, - const v4 &b01, - const v4 &b02, - const v4 &b03, - const v4 &b04, - const v4 &b05, - const v4 &b06, - const v4 &b07, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { + vst1q_f32( (float*)a0, vtrn1q_f64( r, t ) 
); + vst1q_f32( (float*)a1, vtrn1q_f64( s, u ) ); + vst1q_f32( (float*)a2, vtrn2q_f64( r, t ) ); + vst1q_f32( (float*)a3, vtrn2q_f64( s, u ) ); +} + +#if 1 +inline void store_4x8_tr( const v4& b00, const v4& b01, const v4& b02, + const v4& b03, const v4& b04, const v4& b05, + const v4& b06, const v4& b07, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) +{ float32x4x4_t mat0, mat2; mat0.val[0] = vuzp1q_f32( b00.v, b04.v ); @@ -649,351 +570,371 @@ namespace v4 mat2.val[2] = vuzp2q_f32( b02.v, b06.v ); mat2.val[3] = vuzp2q_f32( b03.v, b07.v ); - vst4q_f32( (float *) a0, mat0 ); - vst4q_f32( (float *) a2, mat2 ); - } - #endif + vst4q_f32( (float*)a0, mat0 ); + vst4q_f32( (float*)a2, mat2 ); +} +#endif - ////////////// - // v4int class +////////////// +// v4int class - class v4int : public v4 - { +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) 
ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int 
&b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float 
toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; - - u.i = a; - v = vdupq_n_f32( u.f ); + union { + int i; + float f; + } u; + + u.i = a; + v = vdupq_n_f32( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - // union - // { - // int i; - // float f; - // } u0, u1, u2, u3; - - // u0.i = i0; - // u1.i = i1; - // u2.i = i2; - // u3.i = i3; - - // v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); - - i[0] = i0; - i[1] = i1; - i[2] = i2; - i[3] = i3; + // union + // { + // int i; + // float f; + // } u0, u1, u2, u3; + + // u0.i = i0; + // u1.i = i1; + // u2.i = i2; + // u3.i = i3; + + // v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - i[j] op b.i[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - 
ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) - #undef ASSIGN +#undef ASSIGN - inline v4int &operator =( const v4int &b ) + inline v4int& operator=( const v4int& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } - inline v4int &operator ^=( const v4int &b ) + inline v4int& operator^=( const v4int& b ) { - vsi = veorq_s32( vsi, b.vsi ); + vsi = veorq_s32( vsi, b.vsi ); - return *this; + return *this; } - inline v4int &operator &=( const v4int &b ) + inline v4int& operator&=( const v4int& b ) { - vsi = vandq_s32( vsi, b.vsi ); + vsi = vandq_s32( vsi, b.vsi ); - return *this; + return *this; } - inline v4int &operator |=( const v4int &b ) + inline v4int& operator|=( const v4int& b ) { - vsi = vorrq_s32( vsi, b.vsi ); + vsi = vorrq_s32( vsi, b.vsi ); - return *this; + return *this; } // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int &a ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } - PREFIX_UNARY(+) - PREFIX_UNARY(-) +PREFIX_UNARY( +) +PREFIX_UNARY( -) - inline v4int operator !( const v4int &a ) - { +inline v4int operator!( const v4int& a ) +{ v4int b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = - ( !a.i[j] ); + for ( int j = 0; j < 4; j++ ) + b.i[j] = -( !a.i[j] ); return b; - } +} - PREFIX_UNARY(~) +PREFIX_UNARY( ~) - #undef 
PREFIX_UNARY +#undef PREFIX_UNARY - // v4int prefix increment / decrement +// v4int prefix increment / decrement - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int &a ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) - #undef PREFIX_INCDEC +#undef PREFIX_INCDEC - // v4int postfix increment / decrement +// v4int postfix increment / decrement - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int &a, int ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( a.i[j] op ); \ - return b; \ - } +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) - #undef POSTFIX_INCDEC +#undef POSTFIX_INCDEC - // v4int binary operators +// v4int binary operators - #define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.i[j] = a.i[j] op b.i[j]; \ - return c; \ - } +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) - #undef BINARY +#undef BINARY - inline v4int operator ^( const v4int &a, const v4int &b ) - { +inline v4int operator^( 
const v4int& a, const v4int& b ) +{ v4int c; c.vsi = veorq_s32( a.vsi, b.vsi ); return c; - } +} - inline v4int operator &( const v4int &a, const v4int &b ) - { +inline v4int operator&( const v4int& a, const v4int& b ) +{ v4int c; c.vsi = vandq_s32( a.vsi, b.vsi ); return c; - } +} - inline v4int operator |( const v4int &a, const v4int &b ) - { +inline v4int operator|( const v4int& a, const v4int& b ) +{ v4int c; c.vsi = vorrq_s32( a.vsi, b.vsi ); return c; - } - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.i[j] = - ( a.i[j] op b.i[j] ); \ - return c; \ - } +} + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.i[j] = -( a.i[j] op b.i[j] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) - #undef LOGICAL +#undef LOGICAL - // v4int miscellaneous functions +// v4int miscellaneous functions - inline v4int abs( const v4int &a ) - { +inline v4int abs( const v4int& a ) +{ v4int b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; // This seems broken. 
@@ -1002,14 +943,14 @@ namespace v4 // b.v = _mm_andnot_ps( c.v, a.v ); ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.vsi = vandq_s32( c.vsi, a.vsi ); @@ -1021,10 +962,10 @@ namespace v4 // b.i[j] = a.i[j] & c.i[j]; return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; // This seems broken. @@ -1038,127 +979,162 @@ namespace v4 // _mm_and_ps( c_v, t.v ) ); ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + for ( int j = 0; j < 4; j++ ) + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) 
ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + 
friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float 
rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* 
ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - v = vdupq_n_f32( a ); + v = vdupq_n_f32( a ); } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - // v = _mm_setr_ps( f0, f1, f2, f3 ); + // v = _mm_setr_ps( f0, f1, f2, f3 ); - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op,intrin) \ - inline v4float &operator op( const v4float &b ) \ - { \ - v = intrin( v, b.v ); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } ASSIGN( +=, vaddq_f32 ) @@ -1166,69 +1142,63 @@ namespace v4 ASSIGN( *=, vmulq_f32 ) ASSIGN( /=, vdivq_f32 ) - #undef ASSIGN +#undef ASSIGN - inline v4float &operator =( const v4float &b ) + inline v4float& operator=( const v4float& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v4float prefix unary 
operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.v = a.v; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; // b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = -a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = -a.f[j]; return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; // b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] ? 0 : -1; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] ? 0 : -1; return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; // __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1237,14 +1207,14 @@ namespace v4 // b.v = t; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ++a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = ++a.f[j]; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; // __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1253,16 +1223,16 @@ namespace v4 // b.v = t; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = --a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = --a.f[j]; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; // __m128 a_v = a.v; @@ -1271,14 +1241,14 @@ namespace v4 // b.v = a_v; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = a.f[j]++; + for ( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]++; return b; - } +} 
- inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; // __m128 a_v = a.v; @@ -1287,49 +1257,49 @@ namespace v4 // b.v = a_v; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = a.f[j]--; + for ( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]--; return b; - } +} - // v4float binary operators +// v4float binary operators - #define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - BINARY( +, vaddq_f32 ) - BINARY( -, vsubq_f32 ) - BINARY( *, vmulq_f32 ) - BINARY( /, vdivq_f32 ) +BINARY( +, vaddq_f32 ) +BINARY( -, vsubq_f32 ) +BINARY( *, vmulq_f32 ) +BINARY( /, vdivq_f32 ) - #undef BINARY +#undef BINARY - // v4float logical operators +// v4float logical operators - #define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define LOGICAL( op, intrin ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - LOGICAL( <, vcltq_f32 ) - LOGICAL( >, vcgtq_f32 ) - LOGICAL( ==, vceqq_f32 ) - LOGICAL( <=, vcleq_f32 ) - LOGICAL( >=, vcgeq_f32 ) +LOGICAL( <, vcltq_f32 ) +LOGICAL( >, vcgtq_f32 ) +LOGICAL( ==, vceqq_f32 ) +LOGICAL( <=, vcleq_f32 ) +LOGICAL( >=, vcgeq_f32 ) - #undef LOGICAL +#undef LOGICAL - inline v4int operator !=( const v4float &a, const v4float &b ) - { +inline v4int operator!=( const v4float& a, const v4float& b ) +{ v4int c; // r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); @@ -1339,143 +1309,139 @@ namespace v4 c.vui = vmvnq_u32( vceqq_f32( a.v, b.v ) ); return c; - } +} - inline v4int operator &&( const v4float &a, const v4float 
&b ) - { +inline v4int operator&&( const v4float& a, const v4float& b ) +{ v4int c; - float32x4_t vzero = vdupq_n_f32(0.0f); + float32x4_t vzero = vdupq_n_f32( 0.0f ); // __m128 vzero = _mm_setzero_ps(); // Is there a better way to do this than the SSE way? - c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, vzero ) ), + vmvnq_u32( vceqq_f32( b.v, vzero ) ) ); // c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); return c; - } +} - inline v4int operator ||( const v4float &a, const v4float &b ) - { +inline v4int operator||( const v4float& a, const v4float& b ) +{ v4int c; - float32x4_t vzero = vdupq_n_f32(0.0f); + float32x4_t vzero = vdupq_n_f32( 0.0f ); // __m128 vzero = _mm_setzero_ps(); // Is there a better way to do this than the SSE way? - c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, vzero ) ), + vmvnq_u32( vceqq_f32( b.v, vzero ) ) ); // c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); return c; - } - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.f[j] = ::fn( a.f[j] ); \ - return b; \ - } +} + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.f[j] = ::fn( a.f[j], b.f[j] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + 
for ( int j = 0; j < 4; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) - #undef CMATH_FR1 - #undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - inline v4float copysign( const v4float &a, const v4float &b ) - { + inline v4float + copysign( const v4float& a, const v4float& b ) +{ v4float c; float t; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) + for ( int j = 0; j < 4; j++ ) { - t = ::fabs( a.f[j] ); - if( b.f[j] < 0 ) t = -t; - c.f[j] = t; + t = ::fabs( a.f[j] ); + if ( b.f[j] < 0 ) + t = -t; + c.f[j] = t; } return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.v = vrsqrteq_f32( a.v ); return b; - } +} - inline v4float rsqrt( const v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; float32x4_t a_v = a.v, b_v; b_v = vrsqrteq_f32( a_v ); - b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), - vsubq_f32( b_v, - vmulq_f32( a_v, - vmulq_f32( b_v, - vmulq_f32( b_v, b_v ) - ) - ) - ) - ) - ); + b.v = vaddq_f32( + b_v, + vmulq_f32( + vdupq_n_f32( 0.5f ), + vsubq_f32( + b_v, + vmulq_f32( a_v, vmulq_f32( b_v, vmulq_f32( b_v, b_v ) ) ) ) ) ); // ALWAYS_VECTORIZE // for( int j = 0; j < 4; j++ ) // b.f[j] = ::sqrt( 1.0f / a.f[j] ); 
return b; - } +} - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.v = vrecpeq_f32( a.v ); return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; float32x4_t a_v = a.v, b_v; @@ -1483,20 +1449,17 @@ namespace v4 b_v = vrecpeq_f32( a_v ); b.v = vsubq_f32( vaddq_f32( b_v, b_v ), - vmulq_f32( a_v, - vmulq_f32( b_v, b_v ) - ) - ); + vmulq_f32( a_v, vmulq_f32( b_v, b_v ) ) ); // ALWAYS_VECTORIZE // for( int j = 0; j < 4; j++ ) // b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = vfmaq_f32( c.v, a.v, b.v ); @@ -1505,10 +1468,10 @@ namespace v4 // d.v = vaddq_f32( vmulq_f32( a.v, b.v ), c.v ); return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = vfmsq_f32( c.v, a.v, b.v ); @@ -1517,19 +1480,19 @@ namespace v4 // d.v = vsubq_f32( vmulq_f32( a.v, b.v ), c.v ); return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = vsubq_f32( c.v, vmulq_f32( a.v, b.v ) ); return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.vsi = vbicq_s32( m.vsi, a.vsi ); @@ -1541,10 +1504,10 @@ namespace v4 // b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.vsi = vorrq_s32( m.vsi, a.vsi ); @@ -1556,10 +1519,10 @@ namespace v4 // b.i[j] = m.i[j] | a.i[j]; return b; - } +} - inline v4float 
toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.vsi = veorq_s32( m.vsi, a.vsi ); @@ -1571,31 +1534,28 @@ namespace v4 // b.i[j] = m.i[j] ^ a.i[j]; return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vst1q_f32( p, vaddq_f32( vld1q_f32( p ), a.v ) ); - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vst1q_f32( p, vsubq_f32( vld1q_f32( p ), a.v ) ); - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ vst1q_f32( p, vmulq_f32( vld1q_f32( p ), a.v ) ); - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ float x = wl.f[0], y = wl.f[1], z = wl.f[2]; wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); @@ -1607,7 +1567,7 @@ namespace v4 wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); - } +} } // namespace v4 diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index b192c514..48e0b516 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -11,148 +11,140 @@ #define V4_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE 
__attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend 
inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, 
- const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; + union { + int i[4]; + float f[4]; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - i[0]=a.i[0]; - i[1]=a.i[1]; - i[2]=a.i[2]; - i[3]=a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default 
destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; b.i[0] = a.i[n]; @@ -161,11 +153,11 @@ namespace v4 b.i[3] = a.i[n]; return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; b.i[0] = a.i[i0]; @@ -174,552 +166,544 @@ namespace v4 b.i[3] = a.i[i3]; return b; - } +} - #define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ sw( a.i[0], b.i[0] ); sw( a.i[1], b.i[1] ); sw( a.i[2], b.i[2] ); sw( a.i[3], b.i[3] ); - } - - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - sw( a2.i[3],a3.i[2] ); - } - - #undef sw - - // v4 memory manipulation functions - - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; - a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; - a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; - a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; - } - - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; - ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; - ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; - } - - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; - ( ( int * ALIGNED(16) ) 
p )[2] = a.i[2]; - ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; - } - - inline void clear_4x1( void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = 0; - ( ( int * ALIGNED(16) ) p )[1] = 0; - ( ( int * ALIGNED(16) ) p )[2] = 0; - ( ( int * ALIGNED(16) ) p )[3] = 0; - } - - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; - ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; - ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; - ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; - } - - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +} + +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a2.i[3], a3.i[2] ); +} + +#undef sw + +// v4 memory manipulation functions + +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; +} + +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; +} + +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; +} + +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 0; +} + +inline void copy_4x1( void* ALIGNED( 16 ) dst, 
const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 ))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; +} + +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ( ( int * ALIGNED(16) ) a )[0]; - - ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) b )[0]; - ( ( int * ALIGNED(16) ) b )[0] = t; - - t = ( ( int * ALIGNED(16) ) a )[1]; - - ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; - ( ( int * ALIGNED(16) ) b )[1] = t; - - t = ( ( int * ALIGNED(16) ) a )[2]; - - ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; - ( ( int * ALIGNED(16) ) b )[2] = t; - - t = ( ( int * ALIGNED(16) ) a )[3]; - - ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; - ( ( int * ALIGNED(16) ) b )[3] = t; - } - - // v4 transposed memory manipulation functions - - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.i[0] = ( (const int *) a0 )[0]; - a.i[1] = ( (const int *) a1 )[0]; - a.i[2] = ( (const int *) a2 )[0]; - a.i[3] = ( (const int *) a3 )[0]; - } - - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { - a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - - a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - - a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - - a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; - } - - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * 
ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - } - - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; - } - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (int *) a0 )[0] = a.i[0]; - ( (int *) a1 )[0] = a.i[1]; - ( (int *) a2 )[0] = a.i[2]; - ( (int *) a3 )[0] = a.i[3]; - } - - inline void store_4x2_tr( const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void 
* ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { - ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - - ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - - ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - - ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; - } - - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - } - - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; 
- ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; - } - - ////////////// - // v4int class - - class v4int : public v4 - { + t = ( (int* ALIGNED( 16 ))a )[0]; + + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; + + t = ( (int* ALIGNED( 16 ))a )[1]; + + ( (int* ALIGNED( 16 ))a )[1] = ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; + + t = ( (int* ALIGNED( 16 ))a )[2]; + + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; + + t = ( (int* ALIGNED( 16 ))a )[3]; + + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; +} + +// v4 transposed memory manipulation functions + +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 )[1]; +} + +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] 
= ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; +} + +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; +} + +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; +} + +inline void store_4x3_tr( const v4& a, const v4& b, 
const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; +} + +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; +} + +////////////// +// v4int class + +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + 
friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) 
ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const 
v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const 
v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - i[0] = a.i[0]; - i[1] = a.i[1]; - i[2] = a.i[2]; - i[3] = a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - i[0] = a.i[0]; - i[1] = a.i[1]; - i[2] = a.i[2]; - i[3] = a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - i[0] = a; - i[1] = a; - i[2] = a; - i[3] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - i[0] = i0; - i[1] = i1; - i[2] = i2; - i[3] = i3; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + i[0] op 
b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) - ASSIGN( =) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) + ASSIGN( = ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) - #undef ASSIGN +#undef ASSIGN // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v4int operator !( const v4int & a ) - { + +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v4int operator!( const v4int& a ) +{ v4int b; - b.i[0] = - ( ! a.i[0] ); - b.i[1] = - ( ! a.i[1] ); - b.i[2] = - ( ! a.i[2] ); - b.i[3] = - ( ! 
a.i[3] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); return b; - } - - PREFIX_UNARY(~) - - #undef PREFIX_UNARY - - // v4int prefix increment / decrement - - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - - #undef PREFIX_INCDEC - - // v4int postfix increment / decrement - - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ - { \ - v4int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - - #undef POSTFIX_INCDEC - - // v4int binary operators - - #define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) - BINARY(^) - BINARY(&) - BINARY(|) - - #undef BINARY - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4int miscellaneous functions - - inline v4int abs( const v4int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v4int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( 
op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v4int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v4int binary operators + +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) +BINARY( ^) +BINARY( & ) +BINARY( | ) + +#undef BINARY + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4int miscellaneous functions + +inline v4int abs( const v4int& a ) +{ v4int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -728,10 +712,10 @@ namespace v4 b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.i[0] = a.i[0] & ~c.i[0]; @@ -740,10 +724,10 @@ namespace v4 b.i[3] = a.i[3] & ~c.i[3]; return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.i[0] = a.i[0] & c.i[0]; @@ -752,10 +736,10 @@ namespace v4 b.i[3] = a.i[3] & c.i[3]; return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); @@ -764,160 +748,189 @@ namespace v4 tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int 
) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const 
v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) 
ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - f[0] = a; - f[1] = a; - f[2] = a; - f[3] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op) \ - inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ - f[1] op b.f[1]; \ - f[2] op b.f[2]; \ - f[3] op b.f[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) - #undef ASSIGN +#undef ASSIGN // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; 
} +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.f[0] = +a.f[0]; @@ -926,10 +939,10 @@ namespace v4 b.f[3] = +a.f[3]; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.f[0] = -a.f[0]; @@ -938,10 +951,10 @@ namespace v4 b.f[3] = -a.f[3]; return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; b.i[0] = a.i[0] ? 0 : -1; @@ -950,12 +963,12 @@ namespace v4 b.i[3] = a.i[3] ? 0 : -1; return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; b.f[0] = ++a.f[0]; @@ -964,10 +977,10 @@ namespace v4 b.f[3] = ++a.f[3]; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; b.f[0] = --a.f[0]; @@ -976,12 +989,12 @@ namespace v4 b.f[3] = --a.f[3]; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; b.f[0] = a.f[0]++; @@ -990,10 +1003,10 @@ namespace v4 b.f[3] = a.f[3]++; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; b.f[0] = a.f[0]--; @@ -1002,113 +1015,119 @@ namespace v4 b.f[3] = a.f[3]--; return b; - } - - // v4float binary operators - - #define BINARY(op) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = a.f[0] op b.f[0]; \ - c.f[1] = a.f[1] op b.f[1]; \ - c.f[2] = a.f[2] op b.f[2]; \ - c.f[3] = a.f[3] op b.f[3]; \ - return c; \ - } - - BINARY(+) - BINARY(-) 
- BINARY(*) - BINARY(/) - - #undef BINARY - - // v4float logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.f[0] op b.f[0] ); \ - c.i[1] = - ( a.f[1] op b.f[1] ); \ - c.i[2] = - ( a.f[2] op b.f[2] ); \ - c.i[3] = - ( a.f[3] op b.f[3] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(!=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - return b; \ - } - - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - #undef CMATH_FR1 - #undef CMATH_FR2 - - inline v4float copysign( const v4float &a, const v4float &b ) - { +} + +// v4float binary operators + +#define BINARY( op ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v4float logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = 
-( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( != ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + +#undef CMATH_FR1 +#undef CMATH_FR2 + + inline v4float + copysign( const v4float& a, const v4float& b ) +{ v4float c; float t; t = ::fabs( a.f[0] ); - if ( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) + t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if ( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) + t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if ( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) + t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if ( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) + t = -t; c.f[3] = t; return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1117,10 +1136,10 @@ namespace v4 b.f[3] = ::sqrt( 1.0f / a.f[3] ); return b; - } +} - inline v4float rsqrt( const 
v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1129,10 +1148,10 @@ namespace v4 b.f[3] = ::sqrt( 1.0f / a.f[3] ); return b; - } +} - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.f[0] = 1.0f / a.f[0]; @@ -1141,10 +1160,10 @@ namespace v4 b.f[3] = 1.0f / a.f[3]; return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; b.f[0] = 1.0f / a.f[0]; @@ -1153,10 +1172,10 @@ namespace v4 b.f[3] = 1.0f / a.f[3]; return b; - } +} - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = a.f[0] * b.f[0] + c.f[0]; @@ -1165,10 +1184,10 @@ namespace v4 d.f[3] = a.f[3] * b.f[3] + c.f[3]; return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = a.f[0] * b.f[0] - c.f[0]; @@ -1177,10 +1196,10 @@ namespace v4 d.f[3] = a.f[3] * b.f[3] - c.f[3]; return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = c.f[0] - a.f[0] * b.f[0]; @@ -1189,10 +1208,10 @@ namespace v4 d.f[3] = c.f[3] - a.f[3] * b.f[3]; return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = ( ~m.i[0] ) & a.i[0]; @@ -1201,10 +1220,10 @@ namespace v4 b.i[3] = ( ~m.i[3] ) & a.i[3]; return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = m.i[0] | a.i[0]; @@ -1213,10 +1232,10 @@ namespace v4 b.i[3] = m.i[3] | a.i[3]; return b; 
- } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = m.i[0] ^ a.i[0]; @@ -1225,40 +1244,37 @@ namespace v4 b.i[3] = m.i[3] ^ a.i[3]; return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] += a.f[0]; p[1] += a.f[1]; p[2] += a.f[2]; p[3] += a.f[3]; - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] -= a.f[0]; p[1] -= a.f[1]; p[2] -= a.f[2]; p[3] -= a.f[3]; - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] *= a.f[0]; p[1] *= a.f[1]; p[2] *= a.f[2]; p[3] *= a.f[3]; - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ float x = wl.f[0], y = wl.f[1], z = wl.f[2]; wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); @@ -1270,7 +1286,7 @@ namespace v4 wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); - } +} } // namespace v4 diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h index b192c514..48e0b516 100644 --- a/src/util/v4/v4_portable_v0.h +++ b/src/util/v4/v4_portable_v0.h @@ -11,148 +11,140 @@ #define V4_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) 
+#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + 
void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend 
inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; + union { + int i[4]; + float f[4]; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - i[0]=a.i[0]; - i[1]=a.i[1]; - i[2]=a.i[2]; - i[3]=a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - ~v4() 
{} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; b.i[0] = a.i[n]; @@ -161,11 +153,11 @@ namespace v4 b.i[3] = a.i[n]; return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; b.i[0] = a.i[i0]; @@ -174,552 +166,544 @@ namespace v4 b.i[3] = a.i[i3]; return b; - } +} - #define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ sw( a.i[0], b.i[0] ); sw( a.i[1], b.i[1] ); sw( a.i[2], b.i[2] ); sw( a.i[3], b.i[3] ); - } - - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - sw( a2.i[3],a3.i[2] ); - } - - #undef sw - - // v4 memory manipulation functions - - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; - a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; - a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; - a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; - } - - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; - ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; - ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; - } - - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; - ( ( int * 
ALIGNED(16) ) p )[1] = a.i[1]; - ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; - ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; - } - - inline void clear_4x1( void * ALIGNED(16) p ) - { - ( ( int * ALIGNED(16) ) p )[0] = 0; - ( ( int * ALIGNED(16) ) p )[1] = 0; - ( ( int * ALIGNED(16) ) p )[2] = 0; - ( ( int * ALIGNED(16) ) p )[3] = 0; - } - - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; - ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; - ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; - ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; - } - - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +} + +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a2.i[3], a3.i[2] ); +} + +#undef sw + +// v4 memory manipulation functions + +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; +} + +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; +} + +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; +} + +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 
0; +} + +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 ))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; +} + +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ( ( int * ALIGNED(16) ) a )[0]; - - ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) b )[0]; - ( ( int * ALIGNED(16) ) b )[0] = t; - - t = ( ( int * ALIGNED(16) ) a )[1]; - - ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; - ( ( int * ALIGNED(16) ) b )[1] = t; - - t = ( ( int * ALIGNED(16) ) a )[2]; - - ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; - ( ( int * ALIGNED(16) ) b )[2] = t; - - t = ( ( int * ALIGNED(16) ) a )[3]; - - ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; - ( ( int * ALIGNED(16) ) b )[3] = t; - } - - // v4 transposed memory manipulation functions - - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.i[0] = ( (const int *) a0 )[0]; - a.i[1] = ( (const int *) a1 )[0]; - a.i[2] = ( (const int *) a2 )[0]; - a.i[3] = ( (const int *) a3 )[0]; - } - - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { - a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - - a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - - a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - - a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; - } - - inline void load_4x3_tr( const void * ALIGNED(16) a0, 
- const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - } - - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; - } - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (int *) a0 )[0] = a.i[0]; - ( (int *) a1 )[0] = a.i[1]; - ( (int *) a2 )[0] = a.i[2]; - ( (int *) a3 )[0] = a.i[3]; - } - - inline void store_4x2_tr( const v4 &a, 
- const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { - ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - - ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - - ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - - ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; - } - - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - } - - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = 
b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; - } - - ////////////// - // v4int class - - class v4int : public v4 - { + t = ( (int* ALIGNED( 16 ))a )[0]; + + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; + + t = ( (int* ALIGNED( 16 ))a )[1]; + + ( (int* ALIGNED( 16 ))a )[1] = ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; + + t = ( (int* ALIGNED( 16 ))a )[2]; + + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; + + t = ( (int* ALIGNED( 16 ))a )[3]; + + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; +} + +// v4 transposed memory manipulation functions + +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 )[1]; +} + +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + 
c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; +} + +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; +} + +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; +} 
+ +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; +} + +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; +} + +////////////// +// v4int class + +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend 
inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int 
operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) 
ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend 
inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - i[0] = a.i[0]; - i[1] = a.i[1]; - i[2] = a.i[2]; - i[3] = a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - i[0] = a.i[0]; - i[1] = a.i[1]; - i[2] = a.i[2]; - i[3] = a.i[3]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - i[0] = a; - i[1] = a; - i[2] = a; - i[3] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - i[0] = i0; - i[1] = i1; - i[2] = i2; - i[3] = i3; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline 
v4int& operator op( const v4int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) - ASSIGN( =) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) + ASSIGN( = ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) - #undef ASSIGN +#undef ASSIGN // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v4int operator !( const v4int & a ) - { + +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v4int operator!( const v4int& a ) +{ v4int b; - b.i[0] = - ( ! a.i[0] ); - b.i[1] = - ( ! a.i[1] ); - b.i[2] = - ( ! a.i[2] ); - b.i[3] = - ( ! 
a.i[3] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); return b; - } - - PREFIX_UNARY(~) - - #undef PREFIX_UNARY - - // v4int prefix increment / decrement - - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - - #undef PREFIX_INCDEC - - // v4int postfix increment / decrement - - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ - { \ - v4int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - - #undef POSTFIX_INCDEC - - // v4int binary operators - - #define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) - BINARY(^) - BINARY(&) - BINARY(|) - - #undef BINARY - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4int miscellaneous functions - - inline v4int abs( const v4int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v4int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( 
op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v4int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v4int binary operators + +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) +BINARY( ^) +BINARY( & ) +BINARY( | ) + +#undef BINARY + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4int miscellaneous functions + +inline v4int abs( const v4int& a ) +{ v4int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -728,10 +712,10 @@ namespace v4 b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.i[0] = a.i[0] & ~c.i[0]; @@ -740,10 +724,10 @@ namespace v4 b.i[3] = a.i[3] & ~c.i[3]; return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.i[0] = a.i[0] & c.i[0]; @@ -752,10 +736,10 @@ namespace v4 b.i[3] = a.i[3] & c.i[3]; return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); @@ -764,160 +748,189 @@ namespace v4 tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int 
) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const 
v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) 
ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - f[0] = a; - f[1] = a; - f[2] = a; - f[3] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op) \ - inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ - f[1] op b.f[1]; \ - f[2] op b.f[2]; \ - f[3] op b.f[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) - #undef ASSIGN +#undef ASSIGN // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; 
} +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.f[0] = +a.f[0]; @@ -926,10 +939,10 @@ namespace v4 b.f[3] = +a.f[3]; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.f[0] = -a.f[0]; @@ -938,10 +951,10 @@ namespace v4 b.f[3] = -a.f[3]; return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; b.i[0] = a.i[0] ? 0 : -1; @@ -950,12 +963,12 @@ namespace v4 b.i[3] = a.i[3] ? 0 : -1; return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; b.f[0] = ++a.f[0]; @@ -964,10 +977,10 @@ namespace v4 b.f[3] = ++a.f[3]; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; b.f[0] = --a.f[0]; @@ -976,12 +989,12 @@ namespace v4 b.f[3] = --a.f[3]; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; b.f[0] = a.f[0]++; @@ -990,10 +1003,10 @@ namespace v4 b.f[3] = a.f[3]++; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; b.f[0] = a.f[0]--; @@ -1002,113 +1015,119 @@ namespace v4 b.f[3] = a.f[3]--; return b; - } - - // v4float binary operators - - #define BINARY(op) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = a.f[0] op b.f[0]; \ - c.f[1] = a.f[1] op b.f[1]; \ - c.f[2] = a.f[2] op b.f[2]; \ - c.f[3] = a.f[3] op b.f[3]; \ - return c; \ - } - - BINARY(+) - BINARY(-) 
- BINARY(*) - BINARY(/) - - #undef BINARY - - // v4float logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.f[0] op b.f[0] ); \ - c.i[1] = - ( a.f[1] op b.f[1] ); \ - c.i[2] = - ( a.f[2] op b.f[2] ); \ - c.i[3] = - ( a.f[3] op b.f[3] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(!=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - return b; \ - } - - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - #undef CMATH_FR1 - #undef CMATH_FR2 - - inline v4float copysign( const v4float &a, const v4float &b ) - { +} + +// v4float binary operators + +#define BINARY( op ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v4float logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = 
-( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( != ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + +#undef CMATH_FR1 +#undef CMATH_FR2 + + inline v4float + copysign( const v4float& a, const v4float& b ) +{ v4float c; float t; t = ::fabs( a.f[0] ); - if ( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) + t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if ( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) + t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if ( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) + t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if ( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) + t = -t; c.f[3] = t; return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1117,10 +1136,10 @@ namespace v4 b.f[3] = ::sqrt( 1.0f / a.f[3] ); return b; - } +} - inline v4float rsqrt( const 
v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1129,10 +1148,10 @@ namespace v4 b.f[3] = ::sqrt( 1.0f / a.f[3] ); return b; - } +} - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.f[0] = 1.0f / a.f[0]; @@ -1141,10 +1160,10 @@ namespace v4 b.f[3] = 1.0f / a.f[3]; return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; b.f[0] = 1.0f / a.f[0]; @@ -1153,10 +1172,10 @@ namespace v4 b.f[3] = 1.0f / a.f[3]; return b; - } +} - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = a.f[0] * b.f[0] + c.f[0]; @@ -1165,10 +1184,10 @@ namespace v4 d.f[3] = a.f[3] * b.f[3] + c.f[3]; return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = a.f[0] * b.f[0] - c.f[0]; @@ -1177,10 +1196,10 @@ namespace v4 d.f[3] = a.f[3] * b.f[3] - c.f[3]; return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.f[0] = c.f[0] - a.f[0] * b.f[0]; @@ -1189,10 +1208,10 @@ namespace v4 d.f[3] = c.f[3] - a.f[3] * b.f[3]; return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = ( ~m.i[0] ) & a.i[0]; @@ -1201,10 +1220,10 @@ namespace v4 b.i[3] = ( ~m.i[3] ) & a.i[3]; return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = m.i[0] | a.i[0]; @@ -1213,10 +1232,10 @@ namespace v4 b.i[3] = m.i[3] | a.i[3]; return b; 
- } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.i[0] = m.i[0] ^ a.i[0]; @@ -1225,40 +1244,37 @@ namespace v4 b.i[3] = m.i[3] ^ a.i[3]; return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] += a.f[0]; p[1] += a.f[1]; p[2] += a.f[2]; p[3] += a.f[3]; - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] -= a.f[0]; p[1] -= a.f[1]; p[2] -= a.f[2]; p[3] -= a.f[3]; - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ p[0] *= a.f[0]; p[1] *= a.f[1]; p[2] *= a.f[2]; p[3] *= a.f[3]; - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ float x = wl.f[0], y = wl.f[1], z = wl.f[2]; wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); @@ -1270,7 +1286,7 @@ namespace v4 wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); - } +} } // namespace v4 diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h index 9a6cca87..3d95597a 100644 --- a/src/util/v4/v4_portable_v1.h +++ b/src/util/v4/v4_portable_v1.h @@ -11,7 +11,7 @@ #define V4_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define 
ALIGNED( n ) #endif // This does not work with gcc 5.3.1 and the -fopenmp-simd @@ -22,160 +22,150 @@ // #define ALWAYS_VECTORIZE _Pragma( "simd" ) -#define ALWAYS_VECTORIZE \ - _Pragma( "simd" ) \ - _Pragma( "vector aligned" ) +#define ALWAYS_VECTORIZE _Pragma( "simd" ) _Pragma( "vector aligned" ) -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( 
const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, 
- void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; + union { + int i[4]; + float f[4]; }; public: + v4() {} // Default 
constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[n]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[n]; return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; b.i[0] = a.i[i0]; @@ -184,1023 +174,1044 @@ namespace v4 b.i[3] = a.i[i3]; return b; - } +} - #define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - sw( a.i[j], b.i[j] ); - } + for ( int j = 0; j < 4; j++ ) + sw( a.i[j], b.i[j] ); +} - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - sw( a2.i[3],a3.i[2] ); - } +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a2.i[3], a3.i[2] ); +} - #undef sw +#undef sw - // v4 memory manipulation functions +// v4 memory manipulation functions - inline void 
load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - a.i[j] = ((const int * ALIGNED(16))p)[j]; - } + for ( int j = 0; j < 4; j++ ) + a.i[j] = ( (const int* ALIGNED( 16 ))p )[j]; +} - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; - } + for ( int j = 0; j < 4; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = a.i[j]; +} - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; - } + for ( int j = 0; j < 4; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = a.i[j]; +} - inline void clear_4x1( void * ALIGNED(16) p ) - { +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = 0; - } + for ( int j = 0; j < 4; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = 0; +} - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; - } + for ( int j = 0; j < 4; j++ ) + ( (int* ALIGNED( 16 ))dst )[j] = ( (const int* ALIGNED( 16 ))src )[j]; +} - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) + for ( int j = 0; j < 4; j++ ) { - t = ((int * ALIGNED(16))a)[j]; - ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - ((int * ALIGNED(16))b)[j] = t; + t = ( (int* ALIGNED( 16 ))a )[j]; + ( (int* ALIGNED( 16 ))a )[j] = ( (int* ALIGNED( 16 ))b 
)[j]; + ( (int* ALIGNED( 16 ))b )[j] = t; } - } - - // v4 transposed memory manipulation functions - - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.i[0] = ( (const int *) a0 )[0]; - a.i[1] = ( (const int *) a1 )[0]; - a.i[2] = ( (const int *) a2 )[0]; - a.i[3] = ( (const int *) a3 )[0]; - } - - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { - a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - - a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - - a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - - a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; - } - - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - } - - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; - b.i[0] = ( ( const int * 
ALIGNED(16) ) a0 )[1]; - c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; - - a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; - c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; - - a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; - c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; - - a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; - c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; - d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; - } - - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (int *) a0 )[0] = a.i[0]; - ( (int *) a1 )[0] = a.i[1]; - ( (int *) a2 )[0] = a.i[2]; - ( (int *) a3 )[0] = a.i[3]; - } - - inline void store_4x2_tr( const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { - ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - - ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - - ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - - ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; - } - - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; 
- ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - } - - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; - } - - ////////////// - // v4int class - - class v4int : public v4 - { +} + +// v4 transposed memory manipulation functions + +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 
)[1]; +} + +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; +} + +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; +} + +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* 
ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; +} + +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; +} + +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; +} + +////////////// 
+// v4int class + +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - 
friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b 
) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int 
operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + i[j] = a; } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int 
i0, int i1, int i2, int i3 ) // Init from scalars { - i[0] = i0; - i[1] = i1; - i[2] = i2; - i[3] = i3; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - i[j] op b.i[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) - ASSIGN( =) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) + ASSIGN( = ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) - #undef ASSIGN +#undef ASSIGN // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; - } - }; + inline int operator()( int n ) { return i[n]; } +}; - // v4int prefix unary operators +// v4int prefix unary operators - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - PREFIX_UNARY(+) - PREFIX_UNARY(-) +PREFIX_UNARY( +) +PREFIX_UNARY( -) - inline v4int operator !( const v4int & a ) - { +inline v4int operator!( const v4int& a ) +{ v4int b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = - ( !a.i[j] ); + for ( int j = 0; j < 4; j++ ) + b.i[j] = -( !a.i[j] ); return b; - } - - PREFIX_UNARY(~) - - #undef 
PREFIX_UNARY - - // v4int prefix increment / decrement - - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - - #undef PREFIX_INCDEC - - // v4int postfix increment / decrement - - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ - { \ - v4int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.i[j] = ( a.i[j] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - - #undef POSTFIX_INCDEC - - // v4int binary operators - - #define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.i[j] = a.i[j] op b.i[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) - BINARY(^) - BINARY(&) - BINARY(|) - - #undef BINARY - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.i[j] = - ( a.i[j] op b.i[j] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4int miscellaneous functions - - inline v4int abs( const v4int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v4int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v4int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 
4; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v4int binary operators + +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) +BINARY( ^) +BINARY( & ) +BINARY( | ) + +#undef BINARY + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.i[j] = -( a.i[j] op b.i[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4int miscellaneous functions + +inline v4int abs( const v4int& a ) +{ v4int b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & c.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & c.i[j]; return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + for ( int j = 0; j < 4; j++ ) + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float 
operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const 
v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) 
ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend 
inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 4; j++ ) + f[j] = a; } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op) \ - inline v4float &operator op( const v4float &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) - #undef ASSIGN +#undef ASSIGN // v4float member access operator - inline float &operator []( int n ) - { - 
return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = +a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = +a.f[j]; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = -a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = -a.f[j]; return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] ? 0 : -1; + for ( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] ? 0 : -1; return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ++a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = ++a.f[j]; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = --a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = --a.f[j]; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = a.f[j]++; + for ( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]++; return b; - } +} - inline v4float operator --( v4float 
&a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = a.f[j]--; + for ( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]--; return b; - } - - // v4float binary operators - - #define BINARY(op) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.f[j] = a.f[j] op b.f[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - - #undef BINARY - - // v4float logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.i[j] = - ( a.f[j] op b.f[j] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(!=) - LOGICAL(&&) - LOGICAL(||) - - #undef LOGICAL - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - b.f[j] = ::fn( a.f[j] ); \ - return b; \ - } - - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.f[j] = ::fn( a.f[j], b.f[j] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - #undef CMATH_FR1 - #undef CMATH_FR2 - - inline v4float copysign( const v4float &a, const v4float &b ) - { +} + +// v4float binary operators + +#define BINARY( op ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.f[j] 
= a.f[j] op b.f[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v4float logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( != ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v4float math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 4; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + +#undef CMATH_FR1 +#undef CMATH_FR2 + + inline v4float + copysign( const v4float& a, const v4float& b ) +{ v4float c; float t; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) + for ( int j = 0; j < 4; j++ ) { - t = ::fabs( a.f[j] ); - if( b.f[j] < 0 ) t = -t; - c.f[j] = t; + t = ::fabs( a.f[j] ); + if ( b.f[j] < 0 ) + t = -t; + c.f[j] = t; } return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + for ( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / 
a.f[j] ); return b; - } +} - inline v4float rsqrt( const v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + for ( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; - } +} - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + for ( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] + c.f[j]; + for ( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] - c.f[j]; + for ( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = c.f[j] - a.f[j] * b.f[j]; + for ( int j = 0; j < 4; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE 
- for( int j = 0; j < 4; j++ ) - b.i[j] = ( ~m.i[j] ) & a.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] | a.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] | a.i[j]; return b; - } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] ^ a.i[j]; + for ( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] += a.f[j]; - } + for ( int j = 0; j < 4; j++ ) + p[j] += a.f[j]; +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] -= a.f[j]; - } + for ( int j = 0; j < 4; j++ ) + p[j] -= a.f[j]; +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] *= a.f[j]; - } - - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { + for ( int j = 0; j < 4; j++ ) + p[j] *= a.f[j]; +} + +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( 
v4float& wl, v4float& wh ) +{ float x = wl.f[0], y = wl.f[1], z = wl.f[2]; wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); @@ -1212,7 +1223,7 @@ namespace v4 wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); - } +} } // namespace v4 diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index 5f9e7d9d..75fc0388 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -5,194 +5,186 @@ #error "Do not include v4_sse.h directly; use v4.h" #endif -#include #include +#include #define V4_ACCELERATION #define V4_SSE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v4 { - class v4; - class v4int; - class v4float; +class v4; +class v4int; +class v4float; - template - struct permute - { - constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; +template +struct permute +{ + constexpr static int value = i0 + i1 * 4 + i2 * 16 + i3 * 64; +}; - #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) +#define PERM( i0, i1, i2, i3 ) ( ( i0 ) + (i1)*4 + (i2)*16 + (i3)*64 ) - //////////////// - // v4 base class +//////////////// +// v4 base class - class v4 - { +class v4 +{ friend class v4int; friend class v4float; // v4 miscellaneous friends - friend inline int any( const v4 &a ) ALWAYS_INLINE; - friend inline int all( const v4 &a ) ALWAYS_INLINE; + friend inline int any( const v4& a ) ALWAYS_INLINE; + friend inline int all( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 splat( const v4& a ) ALWAYS_INLINE; - template - friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + template + friend inline v4 shuffle( const v4& a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; - friend 
inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + friend inline void swap( v4& a, v4& b ) ALWAYS_INLINE; + friend inline void transpose( v4& a0, v4& a1, v4& a2, + v4& a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& a, + const v4& b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) ALWAYS_INLINE; + friend inline void load_4x1( const void* ALIGNED( 16 ) p, + v4& a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void store_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void clear_4x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void copy_4x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void swap_4x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - friend inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - v4 &a ) ALWAYS_INLINE; - - friend inline void load_4x2_tr( 
const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, v4 &b ) ALWAYS_INLINE; - - friend inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; - - friend inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - - friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; - - friend inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) ALWAYS_INLINE; - - friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; - - friend inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void load_4x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + v4& a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, + v4& b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, + v4& c, v4& d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const 
v4& a, void* a0, void* a1, void* a2, + void* a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4& a, const v4& b, + void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, + void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, + const v4& d, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3 ) ALWAYS_INLINE; protected: - - union - { - int i[4]; - float f[4]; - __m128 v; + union { + int i[4]; + float f[4]; + __m128 v; }; public: + v4() {} // Default constructor - v4() {} // Default constructor - - v4( const v4 &a ) // Copy constructor + v4( const v4& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v4() {} // Default destructor - }; + ~v4() {} // Default destructor +}; - // v4 miscellaneous functions +// v4 miscellaneous functions - inline int any( const v4 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3]; - } +inline int any( const v4& a ) { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - inline int all( const v4 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3]; - } +inline int all( const v4& a ) { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - template - inline v4 splat( const v4 & a ) - { +template +inline v4 splat( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1, 1, 1, 1>::value ) ); return b; - } +} - template - inline v4 shuffle( const v4 & a ) - { +template +inline v4 shuffle( const v4& a ) +{ v4 b; - b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); return b; - } +} - inline void swap( v4 &a, v4 &b ) - { +inline void swap( v4& a, v4& b ) +{ __m128 t = a.v; 
a.v = b.v; b.v = t; - } +} - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { +inline void transpose( v4& a0, v4& a1, v4& a2, v4& a3 ) +{ __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; - t = _mm_unpackhi_ps( a0_v, a1_v ); + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); - u = _mm_unpackhi_ps( a2_v, a3_v ); + u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); a1_v = _mm_movehl_ps( a2_v, a0_v ); @@ -204,103 +196,88 @@ namespace v4 a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; - } +} - // v4 memory manipulation functions +// v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) - { - a.v = _mm_load_ps( ( float * ) p ); - } - - inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, a.v ); - } +inline void load_4x1( const void* ALIGNED( 16 ) p, v4& a ) +{ + a.v = _mm_load_ps( (float*)p ); +} - inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) - { - _mm_stream_ps( ( float * ) p, a.v ); - } +inline void store_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, a.v ); +} - inline void clear_4x1( void * ALIGNED(16) p ) - { - _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); - } +inline void stream_4x1( const v4& a, void* ALIGNED( 16 ) p ) +{ + _mm_stream_ps( (float*)p, a.v ); +} - inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); - } +inline void clear_4x1( void* ALIGNED( 16 ) p ) +{ + _mm_store_ps( (float*)p, _mm_setzero_ps() ); +} - inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { - __m128 t = _mm_load_ps( ( float * ) a ); +inline void copy_4x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + _mm_store_ps( (float*)dst, _mm_load_ps( (const float*)src ) ); +} - _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); - _mm_store_ps( ( float * ) b, t 
); - } +inline void swap_4x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ + __m128 t = _mm_load_ps( (float*)a ); - // v4 transposed memory manipulation functions + _mm_store_ps( (float*)a, _mm_load_ps( (float*)b ) ); + _mm_store_ps( (float*)b, t ); +} - inline void load_4x1_tr( const void *a0, - const void *a1, - const void *a2, - const void *a3, - v4 &a ) - { - a.v = _mm_setr_ps( ( (const float *) a0 )[0], - ( (const float *) a1 )[0], - ( (const float *) a2 )[0], - ( (const float *) a3 )[0] ); - } +// v4 transposed memory manipulation functions - inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { +inline void load_4x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, v4& a ) +{ + a.v = _mm_setr_ps( ( (const float*)a0 )[0], ( (const float*)a1 )[0], + ( (const float*)a2 )[0], ( (const float*)a3 )[0] ); +} + +inline void load_4x2_tr( const void* ALIGNED( 8 ) a0, + const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, + const void* ALIGNED( 8 ) a3, v4& a, v4& b ) +{ __m128 a_v, b_v, t; b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a0 ), (__m64*)a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64*)a2 ), (__m64*)a3 ); a_v = _mm_shuffle_ps( t, b_v, 0x88 ); b_v = _mm_shuffle_ps( t, b_v, 0xdd ); a.v = a_v; b.v = b_v; - } +} - inline void load_4x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c ) - { +inline void load_4x3_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c ) +{ __m128 a_v, b_v, c_v, t, u; - t = _mm_load_ps( (const float *) a0 ); - b_v = 
_mm_load_ps( (const float *) a1 ); - c_v = _mm_load_ps( (const float *) a2 ); - u = _mm_load_ps( (const float *) a3 ); + t = _mm_load_ps( (const float*)a0 ); + b_v = _mm_load_ps( (const float*)a1 ); + c_v = _mm_load_ps( (const float*)a2 ); + u = _mm_load_ps( (const float*)a3 ); a_v = _mm_unpacklo_ps( t, b_v ); b_v = _mm_unpackhi_ps( t, b_v ); - t = _mm_unpacklo_ps( c_v, u ); - u = _mm_unpackhi_ps( c_v, u ); + t = _mm_unpacklo_ps( c_v, u ); + u = _mm_unpackhi_ps( c_v, u ); c_v = _mm_movelh_ps( b_v, u ); b_v = _mm_movehl_ps( t, a_v ); @@ -309,28 +286,25 @@ namespace v4 a.v = a_v; b.v = b_v; c.v = c_v; - } +} - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { +inline void load_4x4_tr( const void* ALIGNED( 16 ) a0, + const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, + const void* ALIGNED( 16 ) a3, v4& a, v4& b, v4& c, + v4& d ) +{ __m128 a_v, b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *) a0 ); - b_v = _mm_load_ps( (const float *) a1 ); - c_v = _mm_load_ps( (const float *) a2 ); - d_v = _mm_load_ps( (const float *) a3 ); + a_v = _mm_load_ps( (const float*)a0 ); + b_v = _mm_load_ps( (const float*)a1 ); + c_v = _mm_load_ps( (const float*)a2 ); + d_v = _mm_load_ps( (const float*)a3 ); - t = _mm_unpackhi_ps( a_v, b_v ); + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); b_v = _mm_movehl_ps( c_v, a_v ); @@ -342,80 +316,64 @@ namespace v4 b.v = b_v; c.v = c_v; d.v = d_v; - } +} - inline void store_4x1_tr( const v4 &a, - void *a0, - void *a1, - void *a2, - void *a3 ) - { - ( (float *) a0 )[0] = a.f[0]; - ( (float *) a1 )[0] = a.f[1]; - ( (float *) a2 )[0] = a.f[2]; - ( (float *) a3 )[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, - const v4 &b, - void * ALIGNED(8) a0, - void * 
ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) - { +inline void store_4x1_tr( const v4& a, void* a0, void* a1, void* a2, void* a3 ) +{ + ( (float*)a0 )[0] = a.f[0]; + ( (float*)a1 )[0] = a.f[1]; + ( (float*)a2 )[0] = a.f[2]; + ( (float*)a3 )[0] = a.f[3]; +} + +inline void store_4x2_tr( const v4& a, const v4& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3 ) +{ __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 - } + _mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 +} - inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x3_tr( const v4& a, const v4& b, const v4& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64*)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64*)a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + 
_mm_storel_pi( (__m64*)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64*)a3, t ); // a3 b3 -> a3 - ( (float *) a0 )[2] = c.f[0]; - ( (float *) a1 )[2] = c.f[1]; - ( (float *) a2 )[2] = c.f[2]; - ( (float *) a3 )[2] = c.f[3]; - } + ( (float*)a0 )[2] = c.f[0]; + ( (float*)a1 )[2] = c.f[1]; + ( (float*)a2 )[2] = c.f[2]; + ( (float*)a3 )[2] = c.f[3]; +} - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { +inline void store_4x4_tr( const v4& a, const v4& b, const v4& c, const v4& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3 ) +{ __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - t = _mm_unpackhi_ps( a_v, b_v ); + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); b_v = _mm_movehl_ps( c_v, a_v ); @@ -423,241 +381,259 @@ namespace v4 c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *) a0, a_v ); - _mm_store_ps( (float *) a1, b_v ); - _mm_store_ps( (float *) a2, c_v ); - _mm_store_ps( (float *) a3, d_v ); - } + _mm_store_ps( (float*)a0, a_v ); + _mm_store_ps( (float*)a1, b_v ); + _mm_store_ps( (float*)a2, c_v ); + _mm_store_ps( (float*)a3, d_v ); +} - ////////////// - // v4int class +////////////// +// v4int class - class v4int : public v4 - { +class v4int : public v4 +{ // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a ) ALWAYS_INLINE; + friend inline v4int 
operator~( const v4int& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; - friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator++( v4int& a, int ) ALWAYS_INLINE; + friend inline v4int operator--( v4int& a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator+( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator-( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator*(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator/( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline 
v4int operator%( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator^( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&(const v4int& a, const v4int& b)ALWAYS_INLINE; + friend inline v4int operator|( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4int& a, + const v4int& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4int& a, + const v4int& b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ) 
ALWAYS_INLINE; - friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4int abs( const v4int& a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int& c, const v4& a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int& c, const v4& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + friend inline v4 merge( const v4int& c, const v4& t, + const v4& f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) 
ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; public: - // v4int constructors / destructors - v4int() {} // Default constructor + v4int() {} // Default constructor - v4int( const v4int &a ) // Copy constructor + v4int( const v4int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4int( const v4 &a ) // Init from mixed + v4int( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4int( int a ) // Init from scalar + v4int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; + union { + int i; + float f; + } u; - u.i = a; - v = _mm_set1_ps( u.f ); + u.i = a; + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { - union - { - int i; - float f; - } u0, u1, u2, u3; + union { + int i; + float f; + } u0, u1, u2, u3; - u0.i = i0; - u1.i = i1; - u2.i = i2; - u3.i = i3; + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; - v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {} // Destructor + ~v4int() {} // Destructor // v4int assignment operators - #define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v4int& operator op( const v4int& b ) \ + { \ + 
i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( <<= ) + ASSIGN( >>= ) - #undef ASSIGN +#undef ASSIGN - inline v4int &operator =( const v4int &b ) + inline v4int& operator=( const v4int& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } - inline v4int &operator ^=( const v4int &b ) + inline v4int& operator^=( const v4int& b ) { - v = _mm_xor_ps( v, b.v ); + v = _mm_xor_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator &=( const v4int &b ) + inline v4int& operator&=( const v4int& b ) { - v = _mm_and_ps( v, b.v ); + v = _mm_and_ps( v, b.v ); - return *this; + return *this; } - inline v4int &operator |=( const v4int &b ) + inline v4int& operator|=( const v4int& b ) { - v = _mm_or_ps( v, b.v ); + v = _mm_or_ps( v, b.v ); - return *this; + return *this; } // v4int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; + inline int operator()( int n ) { return i[n]; } +}; + +// v4int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v4int operator op( const v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ } - }; - - // v4int prefix unary operators - - #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int &a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } - inline v4int operator +( const v4int &a ) - { +inline v4int operator+( const v4int& a ) +{ v4int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v4int operator !( const 
v4int &a ) - { +inline v4int operator!( const v4int& a ) +{ v4int b; - b.i[0] = - ( ! a.i[0] ); - b.i[1] = - ( ! a.i[1] ); - b.i[2] = - ( ! a.i[2] ); - b.i[3] = - ( ! a.i[3] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); return b; - } +} - inline v4int operator ~( const v4int &a ) - { +inline v4int operator~( const v4int& a ) +{ v4int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; @@ -665,124 +641,124 @@ namespace v4 b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); return b; - } +} - #undef PREFIX_UNARY +#undef PREFIX_UNARY - // v4int prefix increment / decrement +// v4int prefix increment / decrement - #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int &a ) \ - { \ - v4int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - return b; \ - } +#define PREFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) - #undef PREFIX_INCDEC +#undef PREFIX_INCDEC - // v4int postfix increment / decrement +// v4int postfix increment / decrement - #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int &a, int ) \ - { \ - v4int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - return b; \ - } +#define POSTFIX_INCDEC( op ) \ + inline v4int operator op( v4int& a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) - #undef POSTFIX_INCDEC +#undef POSTFIX_INCDEC - // v4int binary operators +// v4int binary operators - #define 
BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - return c; \ - } +#define BINARY( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(<<) - BINARY(>>) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( << ) +BINARY( >> ) - #undef BINARY +#undef BINARY - inline v4int operator ^( const v4int &a, const v4int &b ) - { +inline v4int operator^( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_xor_ps( a.v, b.v ); return c; - } +} - inline v4int operator &( const v4int &a, const v4int &b ) - { +inline v4int operator&( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_and_ps( a.v, b.v ); return c; - } +} - inline v4int operator |( const v4int &a, const v4int &b ) - { +inline v4int operator|( const v4int& a, const v4int& b ) +{ v4int c; c.v = _mm_or_ps( a.v, b.v ); return c; - } - - // v4int logical operators - - #define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ - v4int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - return c; \ - } +} + +// v4int logical operators + +#define LOGICAL( op ) \ + inline v4int operator op( const v4int& a, const v4int& b ) \ + { \ + v4int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) 
+LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) - #undef LOGICAL +#undef LOGICAL - // v4int miscellaneous functions +// v4int miscellaneous functions - inline v4int abs( const v4int &a ) - { +inline v4int abs( const v4int& a ) +{ v4int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -791,148 +767,182 @@ namespace v4 b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; return b; - } +} - inline v4 czero( const v4int &c, const v4 &a ) - { +inline v4 czero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_andnot_ps( c.v, a.v ); return b; - } +} - inline v4 notczero( const v4int &c, const v4 &a ) - { +inline v4 notczero( const v4int& c, const v4& a ) +{ v4 b; b.v = _mm_and_ps( c.v, a.v ); return b; - } +} - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) - { +inline v4 merge( const v4int& c, const v4& t, const v4& f ) +{ v4 tf; __m128 c_v = c.v; - tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), - _mm_and_ps( c_v, t.v ) ); + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), _mm_and_ps( c_v, t.v ) ); return tf; - } +} - //////////////// - // v4float class +//////////////// +// v4float class - class v4float : public v4 - { +class v4float : public v4 +{ // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; - friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; - friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a ) ALWAYS_INLINE; + friend inline v4float operator~( const v4float& a ) ALWAYS_INLINE; + friend inline v4int operator!( const v4float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; - friend inline 
v4float operator --( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; - friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator++( v4float& a, int ) ALWAYS_INLINE; + friend inline v4float operator--( v4float& a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator+( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator-( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4float operator*(const v4float& a, + const v4float& b)ALWAYS_INLINE; + friend inline v4float operator/( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; - friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline 
v4int operator<( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator==( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator!=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator<=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator>=( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator&&( const v4float& a, + const v4float& b ) ALWAYS_INLINE; + friend inline v4int operator||( const v4float& a, + const v4float& b ) ALWAYS_INLINE; // v4float math library friends - #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE - #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - - #undef CMATH_FR1 - #undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v4float fn( const v4float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v4float fn( const v4float& a, const v4float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v4float miscellaneous friends - 
friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + friend inline v4float rsqrt_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float& a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float& a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float& a, const v4float& b, + const v4float& c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int& m, + const v4float& a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int& m, + const v4float& a ) 
ALWAYS_INLINE; + friend inline void increment_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void scale_4x1( float* ALIGNED( 16 ) p, + const v4float& a ) ALWAYS_INLINE; + friend inline void trilinear( v4float& wl, v4float& wh ) ALWAYS_INLINE; public: - // v4float constructors / destructors - v4float() {} // Default constructor + v4float() {} // Default constructor - v4float( const v4float &a ) // Copy constructor + v4float( const v4float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v4float( const v4 &a ) // Init from mixed + v4float( const v4& a ) // Init from mixed { - v = a.v; + v = a.v; } - v4float( float a ) // Init from scalar + v4float( float a ) // Init from scalar { - v = _mm_set1_ps( a ); + v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { - v = _mm_setr_ps( f0, f1, f2, f3 ); + v = _mm_setr_ps( f0, f1, f2, f3 ); } - ~v4float() {} // Destructor + ~v4float() {} // Destructor // v4float assignment operators - #define ASSIGN(op,intrin) \ - inline v4float &operator op( const v4float &b ) \ - { \ - v = intrin( v, b.v ); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v4float& operator op( const v4float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } ASSIGN( +=, _mm_add_ps ) @@ -940,61 +950,55 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) - #undef ASSIGN +#undef ASSIGN - inline v4float &operator =( const v4float &b ) + inline v4float& operator=( const v4float& b ) { - v = b.v; + v = b.v; - return *this; + return *this; } // v4float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { 
return f[n]; } +}; - // v4float prefix unary operators +// v4float prefix unary operators - inline v4float operator +( const v4float &a ) - { +inline v4float operator+( const v4float& a ) +{ v4float b; b.v = a.v; return b; - } +} - inline v4float operator -( const v4float &a ) - { +inline v4float operator-( const v4float& a ) +{ v4float b; b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); return b; - } +} - inline v4int operator !( const v4float &a ) - { +inline v4int operator!( const v4float& a ) +{ v4int b; b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); return b; - } +} - // v4float prefix increment / decrement operators +// v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) - { +inline v4float operator++( v4float& a ) +{ v4float b; __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1003,10 +1007,10 @@ namespace v4 b.v = t; return b; - } +} - inline v4float operator --( v4float &a ) - { +inline v4float operator--( v4float& a ) +{ v4float b; __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); @@ -1015,12 +1019,12 @@ namespace v4 b.v = t; return b; - } +} - // v4float postfix increment / decrement operators +// v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) - { +inline v4float operator++( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1029,10 +1033,10 @@ namespace v4 b.v = a_v; return b; - } +} - inline v4float operator --( v4float &a, int ) - { +inline v4float operator--( v4float& a, int ) +{ v4float b; __m128 a_v = a.v; @@ -1041,165 +1045,161 @@ namespace v4 b.v = a_v; return b; - } +} - // v4float binary operators +// v4float binary operators - #define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v4float operator op( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - 
BINARY( +, _mm_add_ps ) - BINARY( -, _mm_sub_ps ) - BINARY( *, _mm_mul_ps ) - BINARY( /, _mm_div_ps ) +BINARY( +, _mm_add_ps ) +BINARY( -, _mm_sub_ps ) +BINARY( *, _mm_mul_ps ) +BINARY( /, _mm_div_ps ) - #undef BINARY +#undef BINARY - // v4float logical operators +// v4float logical operators - #define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ - v4int c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define LOGICAL( op, intrin ) \ + inline v4int operator op( const v4float& a, const v4float& b ) \ + { \ + v4int c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - LOGICAL( <, _mm_cmplt_ps ) - LOGICAL( >, _mm_cmpgt_ps ) - LOGICAL( ==, _mm_cmpeq_ps ) - LOGICAL( <=, _mm_cmple_ps ) - LOGICAL( >=, _mm_cmpge_ps ) - LOGICAL( !=, _mm_cmpneq_ps ) +LOGICAL( <, _mm_cmplt_ps ) +LOGICAL( >, _mm_cmpgt_ps ) +LOGICAL( ==, _mm_cmpeq_ps ) +LOGICAL( <=, _mm_cmple_ps ) +LOGICAL( >=, _mm_cmpge_ps ) +LOGICAL( !=, _mm_cmpneq_ps ) - #undef LOGICAL +#undef LOGICAL - inline v4int operator &&( const v4float &a, const v4float &b ) - { +inline v4int operator&&( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = + _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } +} - inline v4int operator ||( const v4float &a, const v4float &b ) - { +inline v4int operator||( const v4float& a, const v4float& b ) +{ v4int c; __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), _mm_cmpneq_ps( b.v, vzero ) ); return c; - } - - // v4float math library functions - - #define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) \ - { \ - v4float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - return b; \ - } +} + +// v4float 
math library functions + +#define CMATH_FR1( fn ) \ + inline v4float fn( const v4float& a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } - #define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) \ - { \ - v4float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v4float fn( const v4float& a, const v4float& b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - #undef CMATH_FR1 - #undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - inline v4float fabs( const v4float &a ) - { + inline v4float fabs( const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); return b; - } +} - inline v4float sqrt( const v4float &a ) - { +inline v4float sqrt( const v4float& a ) +{ v4float b; b.v = _mm_sqrt_ps( a.v ); return b; - } +} - inline v4float copysign( const v4float &a, const v4float &b ) - { +inline v4float copysign( const v4float& a, const v4float& b ) +{ v4float c; 
__m128 t = _mm_set1_ps( -0.0f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), - _mm_andnot_ps( t, a.v ) ); + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); return c; - } +} - // v4float miscellaneous functions +// v4float miscellaneous functions - inline v4float rsqrt_approx( const v4float &a ) - { +inline v4float rsqrt_approx( const v4float& a ) +{ v4float b; b.v = _mm_rsqrt_ps( a.v ); return b; - } +} - inline v4float rsqrt( const v4float &a ) - { +inline v4float rsqrt( const v4float& a ) +{ v4float b; __m128 a_v = a.v, b_v; b_v = _mm_rsqrt_ps( a_v ); - b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), - _mm_sub_ps( b_v, - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) - ) - ) - ) - ) - ); + b.v = _mm_add_ps( + b_v, + _mm_mul_ps( + _mm_set1_ps( 0.5f ), + _mm_sub_ps( + b_v, _mm_mul_ps( + a_v, _mm_mul_ps( b_v, _mm_mul_ps( b_v, b_v ) ) ) ) ) ); return b; - } +} - #if 0 +#if 0 inline v4float rsqrt( const v4float &a ) { v4float b; @@ -1211,19 +1211,19 @@ namespace v4 return b; } - #endif +#endif - inline v4float rcp_approx( const v4float &a ) - { +inline v4float rcp_approx( const v4float& a ) +{ v4float b; b.v = _mm_rcp_ps( a.v ); return b; - } +} - inline v4float rcp( const v4float &a ) - { +inline v4float rcp( const v4float& a ) +{ v4float b; __m128 a_v = a.v, b_v; @@ -1231,15 +1231,12 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, b_v ) - ) - ); + _mm_mul_ps( a_v, _mm_mul_ps( b_v, b_v ) ) ); return b; - } +} - #if 0 +#if 0 inline v4float rcp( const v4float &a ) { v4float b; @@ -1251,112 +1248,103 @@ namespace v4 return b; } - #endif +#endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fma( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); return d; - } +} - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) - { 
+inline v4float fms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); return d; - } +} - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) - { +inline v4float fnms( const v4float& a, const v4float& b, const v4float& c ) +{ v4float d; d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); return d; - } +} - inline v4float clear_bits( const v4int &m, const v4float &a ) - { +inline v4float clear_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_andnot_ps( m.v, a.v ); return b; - } +} - inline v4float set_bits( const v4int &m, const v4float &a ) - { +inline v4float set_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_or_ps( m.v, a.v ); return b; - } +} - inline v4float toggle_bits( const v4int &m, const v4float &a ) - { +inline v4float toggle_bits( const v4int& m, const v4float& a ) +{ v4float b; b.v = _mm_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void increment_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void decrement_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void decrement_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); - } +} - inline void scale_4x1( float * ALIGNED(16) p, - const v4float &a ) - { +inline void scale_4x1( float* ALIGNED( 16 ) p, const v4float& a ) +{ _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); - } +} - // Given wl = x y z w, compute: - // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) - // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) - { +// Given wl = x y z w, compute: +// wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) +// wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) 
(1-x)(1+y)(1+z) (1+x)(1+y)(1+z) +inline void trilinear( v4float& wl, v4float& wh ) +{ __m128 l = _mm_set1_ps( 1.0f ); __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); __m128 z = wl.v, xy; - xy = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) - ) - ); + xy = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 0, 0, 1, 1 ) ) ) ); - z = _mm_add_ps( l, - _mm_xor_ps( s, - _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) - ) - ); + z = _mm_add_ps( + l, _mm_xor_ps( s, _mm_shuffle_ps( z, z, PERM( 2, 2, 2, 2 ) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM( 0, 1, 0, 1 ) ), + _mm_shuffle_ps( xy, xy, PERM( 2, 2, 3, 3 ) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 0, 0, 0, 0 ) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); - } + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM( 1, 1, 1, 1 ) ) ); +} - #undef PERM +#undef PERM } // namespace v4 diff --git a/src/util/v8/v8.h b/src/util/v8/v8.h index 3275225b..1a886b4e 100644 --- a/src/util/v8/v8.h +++ b/src/util/v8/v8.h @@ -4,13 +4,13 @@ #define IN_v8_h // FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? 
#ifdef __cplusplus -# if defined USE_V8_PORTABLE -# include "v8_portable.h" -# elif defined USE_V8_AVX2 -# include "v8_avx2.h" -# elif defined USE_V8_AVX -# include "v8_avx.h" -# endif +#if defined USE_V8_PORTABLE +#include "v8_portable.h" +#elif defined USE_V8_AVX2 +#include "v8_avx2.h" +#elif defined USE_V8_AVX +#include "v8_avx.h" +#endif #endif #undef IN_v8_h #endif // _v8_h_ diff --git a/src/util/v8/v8_avx.h b/src/util/v8/v8_avx.h index 7a78a228..5d9f14d3 100644 --- a/src/util/v8/v8_avx.h +++ b/src/util/v8/v8_avx.h @@ -12,209 +12,182 @@ #define V8_AVX_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) // Why does GNU not define this function? // #ifdef __GNUC__ #ifndef __INTEL_COMPILER -#define _mm256_set_m128(va, vb) \ - _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) +#define _mm256_set_m128( va, vb ) \ + _mm256_insertf128_ps( _mm256_castps128_ps256( vb ), va, 1 ) #endif namespace v8 { - class v8; - class v8int; - class v8float; +class v8; +class v8int; +class v8float; - //////////////// - // v8 base class +//////////////// +// v8 base class - class v8 - { +class v8 +{ friend class v8int; friend class v8float; // v8 miscellaneous friends - friend inline int any( const v8 &a ) ALWAYS_INLINE; - friend inline int all( const v8 &a ) ALWAYS_INLINE; + friend inline int any( const v8& a ) ALWAYS_INLINE; + friend inline int all( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 splat( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 shuffle( const v8& a ) ALWAYS_INLINE; - friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; - friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + friend inline void swap( v8& a, 
v8& b ) ALWAYS_INLINE; + friend inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, + v8& a5, v8& a6, v8& a7 ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& a, + const v8& b ) ALWAYS_INLINE; // v8 memory manipulation friends - friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; - friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void load_8x1( const void* ALIGNED( 16 ) p, + v8& a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v8 transposed memory manipulation friends // Note: Half aligned values are permissible in the 8x2_tr variants. 
- friend inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) ALWAYS_INLINE; - - friend inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) ALWAYS_INLINE; - - friend inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; - - friend inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; - - friend inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; - - friend inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; - - friend inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3, - void * ALIGNED(8) a4, - void * ALIGNED(8) a5, - void * ALIGNED(8) a6, - void * ALIGNED(8) a7 ) ALWAYS_INLINE; - - friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 
&c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x4_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x8_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - const v8 &e, const v8 &f, - const v8 &g, const v8 &h, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; + friend inline void load_8x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + const void* a4, const void* a5, + const void* a6, const void* a7, + v8& a ) ALWAYS_INLINE; + + friend inline void + load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, + v8& a, v8& b ) ALWAYS_INLINE; + + friend inline void + load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c ) ALWAYS_INLINE; + + friend inline void + load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + 
v8& a, v8& b, v8& c, v8& d ) ALWAYS_INLINE; + + friend inline void + load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, + v8& h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, + void* a3, void* a4, void* a5, void* a6, + void* a7 ) ALWAYS_INLINE; + + friend inline void + store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) ALWAYS_INLINE; + + friend inline void + store_8x3_tr( const v8& a, const v8& b, const v8& c, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, + void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( + const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( + const v8& a, const v8& b, const v8& c, const v8& d, const v8& e, + const v8& f, const v8& g, const v8& h, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; protected: - - union - { - int i[8]; - float f[8]; - __m256 v; + union { + int i[8]; + float f[8]; + __m256 v; }; public: + v8() {} // Default constructor - v8() {} // Default constructor - - v8( const v8 &a ) // 
Copy constructor + v8( const v8& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v8() {} // Default destructor - }; + ~v8() {} // Default destructor +}; - // v8 miscellaneous functions +// v8 miscellaneous functions - inline int any( const v8 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3] || - a.i[4] || a.i[5] || a.i[6] || a.i[7]; - } +inline int any( const v8& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7]; +} - inline int all( const v8 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3] && - a.i[4] && a.i[5] && a.i[6] && a.i[7]; - } +inline int all( const v8& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7]; +} - template - inline v8 splat( const v8 & a ) - { +template +inline v8 splat( const v8& a ) +{ v8 b; b.v = _mm256_set1_ps( a.v[n] ); return b; - } +} - template - inline v8 shuffle( const v8 & a ) - { +template +inline v8 shuffle( const v8& a ) +{ v8 b; b.i[0] = a.i[i0]; @@ -227,20 +200,20 @@ namespace v8 b.i[7] = a.i[i7]; return b; - } +} - inline void swap( v8 &a, v8 &b ) - { +inline void swap( v8& a, v8& b ) +{ __m256 a_v = a.v; a.v = b.v; b.v = a_v; - } +} - inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) - { +inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, v8& a5, v8& a6, + v8& a7 ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 u0, u1, u2, u3, u4, u5, u6, u7; @@ -271,222 +244,206 @@ namespace v8 a5.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); a6.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); a7.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); - } - - // v8 memory manipulation functions - - inline void load_8x1( const void * ALIGNED(16) p, - v8 &a ) - { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; - a.i[4] = ((const int * ALIGNED(16))p)[4]; - a.i[5] = ((const int * 
ALIGNED(16))p)[5]; - a.i[6] = ((const int * ALIGNED(16))p)[6]; - a.i[7] = ((const int * ALIGNED(16))p)[7]; - } - - inline void store_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } +} - inline void stream_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void clear_8x1( void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; - ((int * ALIGNED(16))p)[4] = 0; - ((int * ALIGNED(16))p)[5] = 0; - ((int * ALIGNED(16))p)[6] = 0; - ((int * ALIGNED(16))p)[7] = 0; - } - - // FIXME: Ordering semantics - inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; - ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; - ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; - ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; - ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; - } +// v8 memory manipulation functions - inline void swap_8x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +inline void load_8x1( const void* 
ALIGNED( 16 ) p, v8& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 16 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 16 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 16 ))p )[6]; + a.i[7] = ( (const int* ALIGNED( 16 ))p )[7]; +} + +inline void store_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void stream_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void clear_8x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 0; + ( (int* ALIGNED( 16 ))p )[4] = 0; + ( (int* ALIGNED( 16 ))p )[5] = 0; + ( (int* ALIGNED( 16 ))p )[6] = 0; + ( (int* ALIGNED( 16 ))p )[7] = 0; +} + +// FIXME: Ordering semantics +inline void copy_8x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 ))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; + ( (int* ALIGNED( 16 ))dst )[4] = ( (const 
int* ALIGNED( 16 ))src )[4]; + ( (int* ALIGNED( 16 ))dst )[5] = ( (const int* ALIGNED( 16 ))src )[5]; + ( (int* ALIGNED( 16 ))dst )[6] = ( (const int* ALIGNED( 16 ))src )[6]; + ( (int* ALIGNED( 16 ))dst )[7] = ( (const int* ALIGNED( 16 ))src )[7]; +} + +inline void swap_8x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( (int* ALIGNED( 16 ))a )[0]; + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( (int* ALIGNED( 16 ))a )[1]; + ( (int* ALIGNED( 16 ))a )[1] = ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + t = ( (int* ALIGNED( 16 ))a )[2]; + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( (int* ALIGNED( 16 ))a )[3]; + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; - t = ((int * ALIGNED(16))a)[4]; - ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; - ((int * ALIGNED(16))b)[4] = t; + t = ( (int* ALIGNED( 16 ))a )[4]; + ( (int* ALIGNED( 16 ))a )[4] = ( (int* ALIGNED( 16 ))b )[4]; + ( (int* ALIGNED( 16 ))b )[4] = t; - t = ((int * ALIGNED(16))a)[5]; - ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; - ((int * ALIGNED(16))b)[5] = t; + t = ( (int* ALIGNED( 16 ))a )[5]; + ( (int* ALIGNED( 16 ))a )[5] = ( (int* ALIGNED( 16 ))b )[5]; + ( (int* ALIGNED( 16 ))b )[5] = t; - t = ((int * ALIGNED(16))a)[6]; - ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; - ((int * ALIGNED(16))b)[6] 
= t; + t = ( (int* ALIGNED( 16 ))a )[6]; + ( (int* ALIGNED( 16 ))a )[6] = ( (int* ALIGNED( 16 ))b )[6]; + ( (int* ALIGNED( 16 ))b )[6] = t; - t = ((int * ALIGNED(16))a)[7]; - ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; - ((int * ALIGNED(16))b)[7] = t; - } - - // v8 transposed memory manipulation functions + t = ( (int* ALIGNED( 16 ))a )[7]; + ( (int* ALIGNED( 16 ))a )[7] = ( (int* ALIGNED( 16 ))b )[7]; + ( (int* ALIGNED( 16 ))b )[7] = t; +} - inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) - { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; - a.i[4] = ((const int *)a4)[0]; - a.i[5] = ((const int *)a5)[0]; - a.i[6] = ((const int *)a6)[0]; - a.i[7] = ((const int *)a7)[0]; - } +// v8 transposed memory manipulation functions - inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) - { +inline void load_8x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, const void* a4, const void* a5, + const void* a6, const void* a7, v8& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; + a.i[4] = ( (const int*)a4 )[0]; + a.i[5] = ( (const int*)a5 )[0]; + a.i[6] = ( (const int*)a6 )[0]; + a.i[7] = ( (const int*)a7 )[0]; +} + +inline void +load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, v8& a, + v8& b ) +{ __m128 zero; __m128 t0, t1, t2, t3; 
__m256 u0, u1; zero = _mm_setzero_ps(); - t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a0 ), (__m64 *)a1 ); - t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a2 ), (__m64 *)a3 ); - t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a4 ), (__m64 *)a5 ); - t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a6 ), (__m64 *)a7 ); + t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a0 ), (__m64*)a1 ); + t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a2 ), (__m64*)a3 ); + t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a4 ), (__m64*)a5 ); + t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a6 ), (__m64*)a7 ); u0 = _mm256_set_m128( t2, t0 ); u1 = _mm256_set_m128( t3, t1 ); a.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); b.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); - } - - inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * 
ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - } - - inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) - { +} + +inline void +load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; +} + +inline 
void +load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d ) +{ __m256 tmp0, tmp1, tmp2, tmp3; - a.v = _mm256_set_m128( _mm_load_ps( (const float *)a4 ), - _mm_load_ps( (const float *)a0 ) ); - b.v = _mm256_set_m128( _mm_load_ps( (const float *)a5 ), - _mm_load_ps( (const float *)a1 ) ); - c.v = _mm256_set_m128( _mm_load_ps( (const float *)a6 ), - _mm_load_ps( (const float *)a2 ) ); - d.v = _mm256_set_m128( _mm_load_ps( (const float *)a7 ), - _mm_load_ps( (const float *)a3 ) ); + a.v = _mm256_set_m128( _mm_load_ps( (const float*)a4 ), + _mm_load_ps( (const float*)a0 ) ); + b.v = _mm256_set_m128( _mm_load_ps( (const float*)a5 ), + _mm_load_ps( (const float*)a1 ) ); + c.v = _mm256_set_m128( _mm_load_ps( (const float*)a6 ), + _mm_load_ps( (const float*)a2 ) ); + d.v = _mm256_set_m128( _mm_load_ps( (const float*)a7 ), + _mm_load_ps( (const float*)a3 ) ); tmp0 = _mm256_shuffle_ps( a.v, b.v, 0x44 ); tmp2 = _mm256_shuffle_ps( a.v, b.v, 0xEE ); @@ -497,31 +454,27 @@ namespace v8 b.v = _mm256_shuffle_ps( tmp0, tmp1, 0xDD ); c.v = _mm256_shuffle_ps( tmp2, tmp3, 0x88 ); d.v = _mm256_shuffle_ps( tmp2, tmp3, 0xDD ); - } - - inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) - { +} + +inline void +load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& 
c, v8& d, v8& e, v8& f, v8& g, v8& h ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 u0, u1, u2, u3, u4, u5, u6, u7; - a.v = _mm256_load_ps( (const float *)a0 ); - b.v = _mm256_load_ps( (const float *)a1 ); - c.v = _mm256_load_ps( (const float *)a2 ); - d.v = _mm256_load_ps( (const float *)a3 ); - e.v = _mm256_load_ps( (const float *)a4 ); - f.v = _mm256_load_ps( (const float *)a5 ); - g.v = _mm256_load_ps( (const float *)a6 ); - h.v = _mm256_load_ps( (const float *)a7 ); + a.v = _mm256_load_ps( (const float*)a0 ); + b.v = _mm256_load_ps( (const float*)a1 ); + c.v = _mm256_load_ps( (const float*)a2 ); + d.v = _mm256_load_ps( (const float*)a3 ); + e.v = _mm256_load_ps( (const float*)a4 ); + f.v = _mm256_load_ps( (const float*)a5 ); + g.v = _mm256_load_ps( (const float*)a6 ); + h.v = _mm256_load_ps( (const float*)a7 ); t0 = _mm256_unpacklo_ps( a.v, b.v ); t1 = _mm256_unpackhi_ps( a.v, b.v ); @@ -549,28 +502,27 @@ namespace v8 f.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); g.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); h.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); - } - - inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) - { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; - ((int *)a4)[0] = a.i[4]; - ((int *)a5)[0] = a.i[5]; - ((int *)a6)[0] = a.i[6]; - ((int *)a7)[0] = a.i[7]; - } +} - inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3, - void * ALIGNED(8) a4, void * ALIGNED(8) a5, - void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) - { +inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, void* a3, + void* a4, void* a5, void* a6, void* a7 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; + ( (int*)a4 )[0] = a.i[4]; + ( (int*)a5 )[0] = a.i[5]; + ( (int*)a6 )[0] = 
a.i[6]; + ( (int*)a7 )[0] = a.i[7]; +} + +inline void store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) +{ __m256 u0, u1; __m128 t0, t1, t2, t3; @@ -582,64 +534,64 @@ namespace v8 t2 = _mm256_extractf128_ps( u0, 1 ); t3 = _mm256_extractf128_ps( u1, 1 ); - _mm_storel_pi( (__m64 *) a0, t0 ); - _mm_storeh_pi( (__m64 *) a1, t0 ); - - _mm_storel_pi( (__m64 *) a2, t1 ); - _mm_storeh_pi( (__m64 *) a3, t1 ); - - _mm_storel_pi( (__m64 *) a4, t2 ); - _mm_storeh_pi( (__m64 *) a5, t2 ); - - _mm_storel_pi( (__m64 *) a6, t3 ); - _mm_storeh_pi( (__m64 *) a7, t3 ); - } - - inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; + _mm_storel_pi( (__m64*)a0, t0 ); + _mm_storeh_pi( (__m64*)a1, t0 ); - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; + _mm_storel_pi( (__m64*)a2, t1 ); + _mm_storeh_pi( (__m64*)a3, t1 ); - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; + _mm_storel_pi( 
(__m64*)a4, t2 ); + _mm_storeh_pi( (__m64*)a5, t2 ); - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - } + _mm_storel_pi( (__m64*)a6, t3 ); + _mm_storeh_pi( (__m64*)a7, t3 ); +} - inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { +inline void store_8x3_tr( const v8& a, const v8& b, const v8& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; +} + +inline void store_8x4_tr( const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, 
+ void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ __m256 u0, u1, u2, u3; __m256 t0, t1, t2, t3; __m128 s0, s1, s2, s3, s4, s5, s6, s7; @@ -664,23 +616,23 @@ namespace v8 s6 = _mm256_extractf128_ps( t2, 1 ); s7 = _mm256_extractf128_ps( t3, 1 ); - _mm_store_ps( (float *) a0, s0 ); - _mm_store_ps( (float *) a1, s1 ); - _mm_store_ps( (float *) a2, s2 ); - _mm_store_ps( (float *) a3, s3 ); - _mm_store_ps( (float *) a4, s4 ); - _mm_store_ps( (float *) a5, s5 ); - _mm_store_ps( (float *) a6, s6 ); - _mm_store_ps( (float *) a7, s7 ); - } - - inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - const v8 &e, const v8 &f, const v8 &g, const v8 &h, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { + _mm_store_ps( (float*)a0, s0 ); + _mm_store_ps( (float*)a1, s1 ); + _mm_store_ps( (float*)a2, s2 ); + _mm_store_ps( (float*)a3, s3 ); + _mm_store_ps( (float*)a4, s4 ); + _mm_store_ps( (float*)a5, s5 ); + _mm_store_ps( (float*)a6, s6 ); + _mm_store_ps( (float*)a7, s7 ); +} + +inline void store_8x8_tr( const v8& a, const v8& b, const v8& c, const v8& d, + const v8& e, const v8& f, const v8& g, const v8& h, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 u0, u1, u2, u3, u4, u5, u6, u7; @@ -712,253 +664,276 @@ namespace v8 t6 = _mm256_permute2f128_ps( u2, u6, 0x31 ); t7 = _mm256_permute2f128_ps( u3, u7, 0x31 ); - _mm256_store_ps( (float *)a0, t0 ); - _mm256_store_ps( (float *)a1, t1 ); - _mm256_store_ps( (float *)a2, t2 ); - _mm256_store_ps( (float *)a3, t3 ); - _mm256_store_ps( (float *)a4, t4 ); - _mm256_store_ps( (float *)a5, t5 ); - _mm256_store_ps( (float 
*)a6, t6 ); - _mm256_store_ps( (float *)a7, t7 ); - } + _mm256_store_ps( (float*)a0, t0 ); + _mm256_store_ps( (float*)a1, t1 ); + _mm256_store_ps( (float*)a2, t2 ); + _mm256_store_ps( (float*)a3, t3 ); + _mm256_store_ps( (float*)a4, t4 ); + _mm256_store_ps( (float*)a5, t5 ); + _mm256_store_ps( (float*)a6, t6 ); + _mm256_store_ps( (float*)a7, t7 ); +} - ////////////// - // v8int class +////////////// +// v8int class - class v8int : public v8 - { +class v8int : public v8 +{ // v8int prefix unary operator friends - friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator~( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8int prefix increment / decrement operator friends - friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a ) ALWAYS_INLINE; // v8int postfix increment / decrement operator friends - friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a, int ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a, int ) ALWAYS_INLINE; // v8int binary operator friends - friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator *( const v8int &a, const v8int &b ) 
ALWAYS_INLINE; - friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator*(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator/( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator%( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator^( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator|( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int logical operator friends - friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8int &a, const 
v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8int abs( const v8int& a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& t, + const v8& f ) ALWAYS_INLINE; // v8float unary operator friends - friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float miscellaneous friends - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float 
toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; public: - // v8int constructors / destructors - v8int() {} // Default constructor + v8int() {} // Default constructor - v8int( const v8int &a ) // Copy constructor + v8int( const v8int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v8int( const v8 &a ) // Init from mixed + v8int( const v8& a ) // Init from mixed { - v = a.v; + v = a.v; } - v8int( int a ) // Init from scalar + v8int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; - u.i = a; - v = _mm256_set1_ps( u.f ); + union { + int i; + float f; + } u; + u.i = a; + v = _mm256_set1_ps( u.f ); } - v8int( int i0, int i1, int i2, int i3, - int i4, int i5, int i6, int i7 ) // Init from scalars + v8int( int i0, int i1, int i2, int i3, int i4, int i5, int i6, + int i7 ) // Init from scalars { - union - { - int i; - float f; - } u0, u1, u2, u3, u4, u5, u6, u7; - - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; - u4.i = i4; u5.i = i5; u6.i = i6; u7.i = i7; - - v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, - u4.f, u5.f, u6.f, u7.f ); + union { + int i; + float f; + } u0, u1, u2, u3, u4, u5, u6, u7; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + u4.i = i4; + u5.i = i5; + u6.i = i6; + u7.i = i7; + + v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, u4.f, u5.f, u6.f, u7.f ); } - ~v8int() {} // Destructor + ~v8int() {} // Destructor // v8int assignment operators -# define ASSIGN(op) \ - inline v8int &operator op( const v8int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - i[4] op b.i[4]; \ - i[5] op b.i[5]; \ - i[6] op b.i[6]; \ - i[7] op b.i[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8int& operator op( const v8int& b ) 
\ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ } - inline v8int &operator =( const v8int &b ) + inline v8int& operator=( const v8int& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) - inline v8int &operator ^=( const v8int &b ) + inline v8int& operator^=( const v8int& b ) { - v = _mm256_xor_ps( v, b.v ); - return *this; + v = _mm256_xor_ps( v, b.v ); + return *this; } - inline v8int &operator &=( const v8int &b ) + inline v8int& operator&=( const v8int& b ) { - v = _mm256_and_ps( v, b.v ); - return *this; + v = _mm256_and_ps( v, b.v ); + return *this; } - inline v8int &operator |=( const v8int &b ) + inline v8int& operator|=( const v8int& b ) { - v = _mm256_or_ps( v, b.v ); - return *this; + v = _mm256_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v8int member access operator - inline int &operator []( int n ) - { - return i[n]; - } - - inline int operator ()( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v8int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v8int operator op( const v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ } - }; - - // v8int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v8int operator op( const v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op 
a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - inline v8int operator +( const v8int & a ) - { +inline v8int operator+( const v8int& a ) +{ v8int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v8int operator !( const v8int & a ) - { +inline v8int operator!( const v8int& a ) +{ v8int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); - b.i[4] = - ( !a.i[4] ); - b.i[5] = - ( !a.i[5] ); - b.i[6] = - ( !a.i[6] ); - b.i[7] = - ( !a.i[7] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); return b; - } +} - inline v8int operator ~( const v8int & a ) - { +inline v8int operator~( const v8int& a ) +{ v8int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; @@ -966,141 +941,141 @@ namespace v8 b.v = _mm256_xor_ps( a.v, _mm256_set1_ps( u.f ) ); return b; - } - -# undef PREFIX_UNARY - - // v8int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v8int operator op( v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } +} + +#undef PREFIX_UNARY + +// v8int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v8int 
postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v8int operator op( v8int & a, int ) \ - { \ - v8int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - b.i[4] = ( a.i[4] op ); \ - b.i[5] = ( a.i[5] op ); \ - b.i[6] = ( a.i[6] op ); \ - b.i[7] = ( a.i[7] op ); \ - return b; \ - } +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v8int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v8int binary operators - -# define BINARY(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - c.i[4] = a.i[4] op b.i[4]; \ - c.i[5] = a.i[5] op b.i[5]; \ - c.i[6] = a.i[6] op b.i[6]; \ - c.i[7] = a.i[7] op b.i[7]; \ - return c; \ - } +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v8int binary operators + +#define BINARY( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) - inline v8int operator ^( const v8int &a, const v8int &b ) - { +inline v8int operator^( const v8int& a, const v8int& 
b ) +{ v8int c; c.v = _mm256_xor_ps( a.v, b.v ); return c; - } +} - inline v8int operator &( const v8int &a, const v8int &b ) - { +inline v8int operator&( const v8int& a, const v8int& b ) +{ v8int c; c.v = _mm256_and_ps( a.v, b.v ); return c; - } +} - inline v8int operator |( const v8int &a, const v8int &b ) - { +inline v8int operator|( const v8int& a, const v8int& b ) +{ v8int c; c.v = _mm256_or_ps( a.v, b.v ); return c; - } - - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v8int logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - c.i[4] = - ( a.i[4] op b.i[4] ); \ - c.i[5] = - ( a.i[5] op b.i[5] ); \ - c.i[6] = - ( a.i[6] op b.i[6] ); \ - c.i[7] = - ( a.i[7] op b.i[7] ); \ - return c; \ - } +} + +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v8int logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) -# undef LOGICAL +#undef LOGICAL - // v8int miscellaneous functions +// v8int miscellaneous functions - inline v8int abs( const v8int &a ) - { +inline v8int abs( const v8int& a ) +{ v8int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -1113,209 +1088,238 @@ namespace v8 b.i[7] = ( a.i[7] >= 0 ) ? 
a.i[7] : -a.i[7]; return b; - } +} - inline v8 czero( const v8int &c, const v8 &a ) - { +inline v8 czero( const v8int& c, const v8& a ) +{ v8 b; b.v = _mm256_andnot_ps( c.v, a.v ); return b; - } +} - inline v8 notczero( const v8int &c, const v8 &a ) - { +inline v8 notczero( const v8int& c, const v8& a ) +{ v8 b; b.v = _mm256_and_ps( c.v, a.v ); return b; - } +} - inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) - { +inline v8 merge( const v8int& c, const v8& t, const v8& f ) +{ __m256 c_v = c.v; v8 tf; - tf.v = _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), - _mm256_and_ps( c_v, t.v ) ); + tf.v = + _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), _mm256_and_ps( c_v, t.v ) ); return tf; - } +} - //////////////// - // v8float class +//////////////// +// v8float class - class v8float : public v8 - { +class v8float : public v8 +{ // v8float prefix unary operator friends - friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator~( const v8float& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8float prefix increment / decrement operator friends - friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a ) ALWAYS_INLINE; // v8float postfix increment / decrement operator friends - friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a, 
int ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a, int ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a, int ) ALWAYS_INLINE; // v8float binary operator friends - friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator*(const v8float& a, + const v8float& b)ALWAYS_INLINE; + friend inline v8float operator/( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline 
v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float math library friends -# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ - const v8float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v8float fn( const v8float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v8float fn( const v8float& a, const v8float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v8float miscellaneous friends - friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) 
ALWAYS_INLINE; - friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rsqrt( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp( const v8float& a ) ALWAYS_INLINE; + friend inline v8float fma( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline void increment_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void scale_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; public: - // v8float constructors / destructors - v8float() {} // Default constructor + v8float() {} // Default constructor - 
v8float( const v8float &a ) // Copy constructor + v8float( const v8float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v8float( const v8 &a ) // Init from mixed + v8float( const v8& a ) // Init from mixed { - v = a.v; + v = a.v; } - v8float( float a ) // Init from scalar + v8float( float a ) // Init from scalar { - v = _mm256_set1_ps( a ); + v = _mm256_set1_ps( a ); } - v8float( float f0, float f1, float f2, float f3, - float f4, float f5, float f6, float f7 ) // Init from scalars + v8float( float f0, float f1, float f2, float f3, float f4, float f5, + float f6, float f7 ) // Init from scalars { - v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); + v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); } - ~v8float() {} // Destructor + ~v8float() {} // Destructor // v8float assignment operators -# define ASSIGN(op,intrin) \ - inline v8float &operator op( const v8float &b ) \ - { \ - v = intrin(v,b.v); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v8float& operator op( const v8float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v8float &operator =( const v8float &b ) + inline v8float& operator=( const v8float& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } - ASSIGN(+=,_mm256_add_ps) - ASSIGN(-=,_mm256_sub_ps) - ASSIGN(*=,_mm256_mul_ps) - ASSIGN(/=,_mm256_div_ps) + ASSIGN( +=, _mm256_add_ps ) + ASSIGN( -=, _mm256_sub_ps ) + ASSIGN( *=, _mm256_mul_ps ) + ASSIGN( /=, _mm256_div_ps ) -# undef ASSIGN +#undef ASSIGN // v8float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v8float prefix unary operators +// v8float prefix unary operators - inline v8float operator +( const v8float &a ) - { +inline v8float operator+( const v8float& a ) +{ v8float b; b.v = a.v; return b; - } +} - inline v8float operator 
-( const v8float &a ) - { +inline v8float operator-( const v8float& a ) +{ v8float b; b.v = _mm256_sub_ps( _mm256_setzero_ps(), a.v ); return b; - } +} - inline v8int operator !( const v8float &a ) - { +inline v8int operator!( const v8float& a ) +{ v8int b; b.v = _mm256_cmp_ps( _mm256_setzero_ps(), a.v, _CMP_EQ_OS ); return b; - } +} - // v8float prefix increment / decrement operators +// v8float prefix increment / decrement operators - inline v8float operator ++( v8float &a ) - { +inline v8float operator++( v8float& a ) +{ v8float b; __m256 t = _mm256_add_ps( a.v, _mm256_set1_ps( 1 ) ); @@ -1323,10 +1327,10 @@ namespace v8 b.v = t; return b; - } +} - inline v8float operator --( v8float &a ) - { +inline v8float operator--( v8float& a ) +{ v8float b; __m256 t = _mm256_sub_ps( a.v, _mm256_set1_ps( 1 ) ); @@ -1334,12 +1338,12 @@ namespace v8 b.v = t; return b; - } +} - // v8float postfix increment / decrement operators +// v8float postfix increment / decrement operators - inline v8float operator ++( v8float &a, int ) - { +inline v8float operator++( v8float& a, int ) +{ v8float b; __m256 a_v = a.v; @@ -1347,159 +1351,159 @@ namespace v8 b.v = a_v; return b; - } +} - inline v8float operator --( v8float &a, int ) - { +inline v8float operator--( v8float& a, int ) +{ v8float b; __m256 a_v = a.v; - a.v = _mm256_sub_ps(a_v, _mm256_set1_ps( 1 ) ); + a.v = _mm256_sub_ps( a_v, _mm256_set1_ps( 1 ) ); b.v = a_v; return b; - } +} - // v8float binary operators +// v8float binary operators -# define BINARY(op,intrin) \ - inline v8float operator op( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v8float operator op( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - BINARY( +, _mm256_add_ps ) - BINARY( -, _mm256_sub_ps ) - BINARY( *, _mm256_mul_ps ) - BINARY( /, _mm256_div_ps ) +BINARY( +, _mm256_add_ps ) +BINARY( -, 
_mm256_sub_ps ) +BINARY( *, _mm256_mul_ps ) +BINARY( /, _mm256_div_ps ) -# undef BINARY +#undef BINARY - // v8float logical operators +// v8float logical operators -# define LOGICAL(op,intrin,flag) \ - inline v8int operator op( const v8float &a, const v8float &b ) \ - { \ - v8int c; \ - c.v = intrin( a.v, b.v, flag ); \ - return c; \ - } +#define LOGICAL( op, intrin, flag ) \ + inline v8int operator op( const v8float& a, const v8float& b ) \ + { \ + v8int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } - LOGICAL( <, _mm256_cmp_ps, _CMP_LT_OS ) - LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) - LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) - LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) - LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) - LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) +LOGICAL( <, _mm256_cmp_ps, _CMP_LT_OS ) +LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) +LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) +LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) +LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) +LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) - inline v8int operator &&( const v8float &a, const v8float &b ) - { +inline v8int operator&&( const v8float& a, const v8float& b ) +{ v8int c; __m256 vzero = _mm256_setzero_ps(); c.v = _mm256_and_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), - _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); return c; - } +} - inline v8int operator ||( const v8float &a, const v8float &b ) - { +inline v8int operator||( const v8float& a, const v8float& b ) +{ v8int c; __m256 vzero = _mm256_setzero_ps(); c.v = _mm256_or_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), - _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); return c; - } - -# undef LOGICAL - - // v8float math library functions - -# define CMATH_FR1(fn) \ - inline v8float fn( const v8float &a ) \ - { \ - v8float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - b.f[4] 
= ::fn( a.f[4] ); \ - b.f[5] = ::fn( a.f[5] ); \ - b.f[6] = ::fn( a.f[6] ); \ - b.f[7] = ::fn( a.f[7] ); \ - return b; \ - } +} + +#undef LOGICAL + +// v8float math library functions + +#define CMATH_FR1( fn ) \ + inline v8float fn( const v8float& a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } -# define CMATH_FR2(fn) \ - inline v8float fn( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - c.f[4] = ::fn( a.f[4], b.f[4] ); \ - c.f[5] = ::fn( a.f[5], b.f[5] ); \ - c.f[6] = ::fn( a.f[6], b.f[6] ); \ - c.f[7] = ::fn( a.f[7], b.f[7] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v8float fn( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + 
/*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - inline v8float fabs( const v8float &a ) - { + inline v8float fabs( const v8float& a ) +{ v8float b; b.v = _mm256_andnot_ps( _mm256_set1_ps( -0.f ), a.v ); return b; - } +} - inline v8float sqrt( const v8float &a ) - { +inline v8float sqrt( const v8float& a ) +{ v8float b; b.v = _mm256_sqrt_ps( a.v ); return b; - } +} - inline v8float copysign( const v8float &a, const v8float &b ) - { +inline v8float copysign( const v8float& a, const v8float& b ) +{ v8float c; __m256 t = _mm256_set1_ps( -0.f ); - c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), - _mm256_andnot_ps( t, a.v ) ); + c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), _mm256_andnot_ps( t, a.v ) ); return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v8float miscellaneous functions +// v8float miscellaneous functions - inline v8float rsqrt_approx( const v8float &a ) - { +inline v8float rsqrt_approx( const v8float& a ) +{ v8float b; - b.v = _mm256_rsqrt_ps(a.v); + b.v = _mm256_rsqrt_ps( a.v ); return b; - } +} - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1515,27 +1519,29 @@ namespace v8 return b; } - #endif +#endif - inline v8float rsqrt( const v8float &a ) - { +inline v8float rsqrt( const v8float& a ) +{ v8float b; __m256 a_v = a.v, b_v; - b_v = _mm256_rsqrt_ps(a_v); + b_v = _mm256_rsqrt_ps( a_v ); // Note: It is quicker to just call div_ps and sqrt_ps if more // refinement desired! 
- b.v = _mm256_add_ps( b_v, _mm256_mul_ps( _mm256_set1_ps( 0.5f ), - _mm256_sub_ps( b_v, - _mm256_mul_ps( a_v, - _mm256_mul_ps( b_v, - _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); + b.v = _mm256_add_ps( + b_v, _mm256_mul_ps( + _mm256_set1_ps( 0.5f ), + _mm256_sub_ps( + b_v, _mm256_mul_ps( + a_v, _mm256_mul_ps( + b_v, _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); return b; - } +} - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1544,9 +1550,9 @@ namespace v8 return b; } - #endif +#endif - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1556,18 +1562,18 @@ namespace v8 return b; } - #endif +#endif - inline v8float rcp_approx( const v8float &a ) - { +inline v8float rcp_approx( const v8float& a ) +{ v8float b; b.v = _mm256_rcp_ps( a.v ); return b; - } +} - #if 0 +#if 0 inline v8float rcp( const v8float &a ) { v8float b; @@ -1583,21 +1589,21 @@ namespace v8 return b; } - #endif +#endif - inline v8float rcp( const v8float &a ) - { +inline v8float rcp( const v8float& a ) +{ v8float b; __m256 a_v = a.v, b_v; b_v = _mm256_rcp_ps( a_v ); b.v = _mm256_sub_ps( _mm256_add_ps( b_v, b_v ), - _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); + _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); return b; - } +} - #if 0 +#if 0 inline v8float rcp( const v8float &a ) { v8float b; @@ -1606,10 +1612,10 @@ namespace v8 return b; } - #endif +#endif - inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fma( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.v = _mm256_add_ps( _mm256_mul_ps( a.v, b.v ), c.v ); @@ -1617,10 +1623,10 @@ namespace v8 // d.v = _mm256_fmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.v = _mm256_sub_ps( _mm256_mul_ps( a.v, b.v ), c.v ); @@ -1628,10 +1634,10 @@ namespace v8 // d.v = _mm256_fmsub_ps( a.v, b.v, 
c.v ); return d; - } +} - inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fnms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.v = _mm256_sub_ps( c.v, _mm256_mul_ps( a.v, b.v ) ); @@ -1639,49 +1645,49 @@ namespace v8 // d.v = _mm256_fnmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v8float clear_bits( const v8int &m, const v8float &a ) - { +inline v8float clear_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_andnot_ps( m.v, a.v ); return b; - } +} - inline v8float set_bits( const v8int &m, const v8float &a ) - { +inline v8float set_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_or_ps( m.v, a.v ); return b; - } +} - inline v8float toggle_bits( const v8int &m, const v8float &a ) - { +inline v8float toggle_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void increment_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_add_ps( _mm256_load_ps( p ), a.v ) ); - } +} - inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void decrement_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_sub_ps( _mm256_load_ps( p ), a.v ) ); - } +} - inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void scale_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_mul_ps( _mm256_load_ps( p ), a.v ) ); - } +} } // namespace v8 diff --git a/src/util/v8/v8_avx2.h b/src/util/v8/v8_avx2.h index 08ba6843..723683f1 100644 --- a/src/util/v8/v8_avx2.h +++ b/src/util/v8/v8_avx2.h @@ -12,209 +12,182 @@ #define V8_AVX2_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) // Why does GNU 
not define this function? // #ifdef __GNUC__ #ifndef __INTEL_COMPILER -#define _mm256_set_m128(va, vb) \ - _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) +#define _mm256_set_m128( va, vb ) \ + _mm256_insertf128_ps( _mm256_castps128_ps256( vb ), va, 1 ) #endif namespace v8 { - class v8; - class v8int; - class v8float; +class v8; +class v8int; +class v8float; - //////////////// - // v8 base class +//////////////// +// v8 base class - class v8 - { +class v8 +{ friend class v8int; friend class v8float; // v8 miscellaneous friends - friend inline int any( const v8 &a ) ALWAYS_INLINE; - friend inline int all( const v8 &a ) ALWAYS_INLINE; + friend inline int any( const v8& a ) ALWAYS_INLINE; + friend inline int all( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 splat( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 shuffle( const v8& a ) ALWAYS_INLINE; - friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; - friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + friend inline void swap( v8& a, v8& b ) ALWAYS_INLINE; + friend inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, + v8& a5, v8& a6, v8& a7 ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& a, + const v8& b ) ALWAYS_INLINE; // v8 memory manipulation friends - friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; - friend inline void 
store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void load_8x1( const void* ALIGNED( 16 ) p, + v8& a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v8 transposed memory manipulation friends // Note: Half aligned values are permissible in the 8x2_tr variants. 
- friend inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) ALWAYS_INLINE; - - friend inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) ALWAYS_INLINE; - - friend inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; - - friend inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; - - friend inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; - - friend inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; - - friend inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3, - void * ALIGNED(8) a4, - void * ALIGNED(8) a5, - void * ALIGNED(8) a6, - void * ALIGNED(8) a7 ) ALWAYS_INLINE; - - friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 
&c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x4_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x8_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - const v8 &e, const v8 &f, - const v8 &g, const v8 &h, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; + friend inline void load_8x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + const void* a4, const void* a5, + const void* a6, const void* a7, + v8& a ) ALWAYS_INLINE; + + friend inline void + load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, + v8& a, v8& b ) ALWAYS_INLINE; + + friend inline void + load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c ) ALWAYS_INLINE; + + friend inline void + load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + 
v8& a, v8& b, v8& c, v8& d ) ALWAYS_INLINE; + + friend inline void + load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, + v8& h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, + void* a3, void* a4, void* a5, void* a6, + void* a7 ) ALWAYS_INLINE; + + friend inline void + store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) ALWAYS_INLINE; + + friend inline void + store_8x3_tr( const v8& a, const v8& b, const v8& c, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, + void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( + const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( + const v8& a, const v8& b, const v8& c, const v8& d, const v8& e, + const v8& f, const v8& g, const v8& h, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; protected: - - union - { - int i[8]; - float f[8]; - __m256 v; + union { + int i[8]; + float f[8]; + __m256 v; }; public: + v8() {} // Default constructor - v8() {} // Default constructor - - v8( const v8 &a ) // 
Copy constructor + v8( const v8& a ) // Copy constructor { - v = a.v; + v = a.v; } - ~v8() {} // Default destructor - }; + ~v8() {} // Default destructor +}; - // v8 miscellaneous functions +// v8 miscellaneous functions - inline int any( const v8 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3] || - a.i[4] || a.i[5] || a.i[6] || a.i[7]; - } +inline int any( const v8& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7]; +} - inline int all( const v8 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3] && - a.i[4] && a.i[5] && a.i[6] && a.i[7]; - } +inline int all( const v8& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7]; +} - template - inline v8 splat( const v8 & a ) - { +template +inline v8 splat( const v8& a ) +{ v8 b; b.v = _mm256_set1_ps( a.v[n] ); return b; - } +} - template - inline v8 shuffle( const v8 & a ) - { +template +inline v8 shuffle( const v8& a ) +{ v8 b; b.i[0] = a.i[i0]; @@ -227,20 +200,20 @@ namespace v8 b.i[7] = a.i[i7]; return b; - } +} - inline void swap( v8 &a, v8 &b ) - { +inline void swap( v8& a, v8& b ) +{ __m256 a_v = a.v; a.v = b.v; b.v = a_v; - } +} - inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) - { +inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, v8& a5, v8& a6, + v8& a7 ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 u0, u1, u2, u3, u4, u5, u6, u7; @@ -271,160 +244,150 @@ namespace v8 a5.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); a6.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); a7.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); - } - - // v8 memory manipulation functions - - inline void load_8x1( const void * ALIGNED(16) p, - v8 &a ) - { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; - a.i[4] = ((const int * ALIGNED(16))p)[4]; - a.i[5] = ((const int * 
ALIGNED(16))p)[5]; - a.i[6] = ((const int * ALIGNED(16))p)[6]; - a.i[7] = ((const int * ALIGNED(16))p)[7]; - } - - inline void store_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void stream_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void clear_8x1( void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; - ((int * ALIGNED(16))p)[4] = 0; - ((int * ALIGNED(16))p)[5] = 0; - ((int * ALIGNED(16))p)[6] = 0; - ((int * ALIGNED(16))p)[7] = 0; - } +} - // FIXME: Ordering semantics - inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; - ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; - ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; - ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; - ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; - } +// v8 memory manipulation functions - inline void swap_8x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +inline void load_8x1( const void* 
ALIGNED( 16 ) p, v8& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 16 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 16 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 16 ))p )[6]; + a.i[7] = ( (const int* ALIGNED( 16 ))p )[7]; +} + +inline void store_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void stream_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void clear_8x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 0; + ( (int* ALIGNED( 16 ))p )[4] = 0; + ( (int* ALIGNED( 16 ))p )[5] = 0; + ( (int* ALIGNED( 16 ))p )[6] = 0; + ( (int* ALIGNED( 16 ))p )[7] = 0; +} + +// FIXME: Ordering semantics +inline void copy_8x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 ))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; + ( (int* ALIGNED( 16 ))dst )[4] = ( (const 
int* ALIGNED( 16 ))src )[4]; + ( (int* ALIGNED( 16 ))dst )[5] = ( (const int* ALIGNED( 16 ))src )[5]; + ( (int* ALIGNED( 16 ))dst )[6] = ( (const int* ALIGNED( 16 ))src )[6]; + ( (int* ALIGNED( 16 ))dst )[7] = ( (const int* ALIGNED( 16 ))src )[7]; +} + +inline void swap_8x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( (int* ALIGNED( 16 ))a )[0]; + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( (int* ALIGNED( 16 ))a )[1]; + ( (int* ALIGNED( 16 ))a )[1] = ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + t = ( (int* ALIGNED( 16 ))a )[2]; + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( (int* ALIGNED( 16 ))a )[3]; + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; - t = ((int * ALIGNED(16))a)[4]; - ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; - ((int * ALIGNED(16))b)[4] = t; + t = ( (int* ALIGNED( 16 ))a )[4]; + ( (int* ALIGNED( 16 ))a )[4] = ( (int* ALIGNED( 16 ))b )[4]; + ( (int* ALIGNED( 16 ))b )[4] = t; - t = ((int * ALIGNED(16))a)[5]; - ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; - ((int * ALIGNED(16))b)[5] = t; + t = ( (int* ALIGNED( 16 ))a )[5]; + ( (int* ALIGNED( 16 ))a )[5] = ( (int* ALIGNED( 16 ))b )[5]; + ( (int* ALIGNED( 16 ))b )[5] = t; - t = ((int * ALIGNED(16))a)[6]; - ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; - ((int * ALIGNED(16))b)[6] 
= t; + t = ( (int* ALIGNED( 16 ))a )[6]; + ( (int* ALIGNED( 16 ))a )[6] = ( (int* ALIGNED( 16 ))b )[6]; + ( (int* ALIGNED( 16 ))b )[6] = t; - t = ((int * ALIGNED(16))a)[7]; - ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; - ((int * ALIGNED(16))b)[7] = t; - } + t = ( (int* ALIGNED( 16 ))a )[7]; + ( (int* ALIGNED( 16 ))a )[7] = ( (int* ALIGNED( 16 ))b )[7]; + ( (int* ALIGNED( 16 ))b )[7] = t; +} - // v8 transposed memory manipulation functions +// v8 transposed memory manipulation functions - inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) - { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; - a.i[4] = ((const int *)a4)[0]; - a.i[5] = ((const int *)a5)[0]; - a.i[6] = ((const int *)a6)[0]; - a.i[7] = ((const int *)a7)[0]; - } - - inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) - { +inline void load_8x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, const void* a4, const void* a5, + const void* a6, const void* a7, v8& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; + a.i[4] = ( (const int*)a4 )[0]; + a.i[5] = ( (const int*)a5 )[0]; + a.i[6] = ( (const int*)a6 )[0]; + a.i[7] = ( (const int*)a7 )[0]; +} + +inline void +load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, v8& a, + v8& b ) +{ __m128 zero; __m128 t0, t1, t2, t3; 
__m256 u0, u1; zero = _mm_setzero_ps(); - t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a0 ), (__m64 *)a1 ); - t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a2 ), (__m64 *)a3 ); - t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a4 ), (__m64 *)a5 ); - t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a6 ), (__m64 *)a7 ); + t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a0 ), (__m64*)a1 ); + t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a2 ), (__m64*)a3 ); + t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a4 ), (__m64*)a5 ); + t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64*)a6 ), (__m64*)a7 ); u0 = _mm256_set_m128( t2, t0 ); u1 = _mm256_set_m128( t3, t1 ); a.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); b.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); - } +} - #if 0 +#if 0 // This is an alternate AVX-2 implementation. inline void load_8x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, @@ -461,71 +424,65 @@ namespace v8 b0.v = _mm256_permute2f128_ps( t0, t4, 0x20 ); b1.v = _mm256_permute2f128_ps( t1, t5, 0x20 ); } - #endif - - inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * 
ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - } +#endif - inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) - { +inline void +load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + + a.i[6] = ( (const int* 
ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; +} + +inline void +load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d ) +{ __m256 tmp0, tmp1, tmp2, tmp3; - a.v = _mm256_set_m128( _mm_load_ps( (const float *)a4 ), - _mm_load_ps( (const float *)a0 ) ); - b.v = _mm256_set_m128( _mm_load_ps( (const float *)a5 ), - _mm_load_ps( (const float *)a1 ) ); - c.v = _mm256_set_m128( _mm_load_ps( (const float *)a6 ), - _mm_load_ps( (const float *)a2 ) ); - d.v = _mm256_set_m128( _mm_load_ps( (const float *)a7 ), - _mm_load_ps( (const float *)a3 ) ); + a.v = _mm256_set_m128( _mm_load_ps( (const float*)a4 ), + _mm_load_ps( (const float*)a0 ) ); + b.v = _mm256_set_m128( _mm_load_ps( (const float*)a5 ), + _mm_load_ps( (const float*)a1 ) ); + c.v = _mm256_set_m128( _mm_load_ps( (const float*)a6 ), + _mm_load_ps( (const float*)a2 ) ); + d.v = _mm256_set_m128( _mm_load_ps( (const float*)a7 ), + _mm_load_ps( (const float*)a3 ) ); tmp0 = _mm256_shuffle_ps( a.v, b.v, 0x44 ); tmp2 = _mm256_shuffle_ps( a.v, b.v, 0xEE ); @@ -536,30 +493,26 @@ namespace v8 b.v = _mm256_shuffle_ps( tmp0, tmp1, 0xDD ); c.v = _mm256_shuffle_ps( tmp2, tmp3, 0x88 ); d.v = _mm256_shuffle_ps( tmp2, tmp3, 0xDD ); - } - - // This is a cleaner reference AVX-2 implementation. 
- inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &b0, v8 &b1, v8 &b2, v8 &b3, - v8 &b4, v8 &b5, v8 &b6, v8 &b7 ) - { +} + +// This is a cleaner reference AVX-2 implementation. +inline void +load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& b0, + v8& b1, v8& b2, v8& b3, v8& b4, v8& b5, v8& b6, v8& b7 ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; - t0 = _mm256_load_ps( (const float *)a0 ); - t1 = _mm256_load_ps( (const float *)a1 ); - t2 = _mm256_load_ps( (const float *)a2 ); - t3 = _mm256_load_ps( (const float *)a3 ); - t4 = _mm256_load_ps( (const float *)a4 ); - t5 = _mm256_load_ps( (const float *)a5 ); - t6 = _mm256_load_ps( (const float *)a6 ); - t7 = _mm256_load_ps( (const float *)a7 ); + t0 = _mm256_load_ps( (const float*)a0 ); + t1 = _mm256_load_ps( (const float*)a1 ); + t2 = _mm256_load_ps( (const float*)a2 ); + t3 = _mm256_load_ps( (const float*)a3 ); + t4 = _mm256_load_ps( (const float*)a4 ); + t5 = _mm256_load_ps( (const float*)a5 ); + t6 = _mm256_load_ps( (const float*)a6 ); + t7 = _mm256_load_ps( (const float*)a7 ); b0.v = _mm256_unpacklo_ps( t0, t1 ); b1.v = _mm256_unpackhi_ps( t0, t1 ); @@ -570,14 +523,14 @@ namespace v8 b6.v = _mm256_unpacklo_ps( t6, t7 ); b7.v = _mm256_unpackhi_ps( t6, t7 ); - t0 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - t1 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - t2 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - t3 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - t4 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 1, 0, 1, 0 ) 
); - t5 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - t6 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - t7 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t0 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t1 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t2 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t3 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t4 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t5 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t6 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t7 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); b0.v = _mm256_permute2f128_ps( t0, t4, 0x20 ); b1.v = _mm256_permute2f128_ps( t1, t5, 0x20 ); @@ -587,9 +540,9 @@ namespace v8 b5.v = _mm256_permute2f128_ps( t1, t5, 0x31 ); b6.v = _mm256_permute2f128_ps( t2, t6, 0x31 ); b7.v = _mm256_permute2f128_ps( t3, t7, 0x31 ); - } +} - #if 0 +#if 0 // This is the reference AVX-2 implementation. inline void load_8x8_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -642,9 +595,9 @@ namespace v8 g.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); h.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); } - #endif +#endif - #if 0 +#if 0 // Replace _mm256_load_ps with _mm256_insertf128_ps. inline void load_8x8_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -686,9 +639,9 @@ namespace v8 g.v = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); h.v = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); } - #endif +#endif - #if 0 +#if 0 // Replace _mm256_load_ps with _mm256_insertf128_ps. Replace two calls to // _mm256_shuffle_ps with one call to _mm256_shuffle_ps and two calls to // _mm256_blend_ps. 
@@ -745,28 +698,27 @@ namespace v8 g.v = _mm256_blend_ps( t5, v0, 0xCC ); h.v = _mm256_blend_ps( t7, v0, 0x33 ); } - #endif - - inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) - { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; - ((int *)a4)[0] = a.i[4]; - ((int *)a5)[0] = a.i[5]; - ((int *)a6)[0] = a.i[6]; - ((int *)a7)[0] = a.i[7]; - } +#endif - inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3, - void * ALIGNED(8) a4, void * ALIGNED(8) a5, - void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) - { +inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, void* a3, + void* a4, void* a5, void* a6, void* a7 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; + ( (int*)a4 )[0] = a.i[4]; + ( (int*)a5 )[0] = a.i[5]; + ( (int*)a6 )[0] = a.i[6]; + ( (int*)a7 )[0] = a.i[7]; +} + +inline void store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) +{ __m256 u0, u1; __m128 t0, t1, t2, t3; @@ -778,64 +730,64 @@ namespace v8 t2 = _mm256_extractf128_ps( u0, 1 ); t3 = _mm256_extractf128_ps( u1, 1 ); - _mm_storel_pi( (__m64 *) a0, t0 ); - _mm_storeh_pi( (__m64 *) a1, t0 ); - - _mm_storel_pi( (__m64 *) a2, t1 ); - _mm_storeh_pi( (__m64 *) a3, t1 ); - - _mm_storel_pi( (__m64 *) a4, t2 ); - _mm_storeh_pi( (__m64 *) a5, t2 ); - - _mm_storel_pi( (__m64 *) a6, t3 ); - _mm_storeh_pi( (__m64 *) a7, t3 ); - } - - inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - 
void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + _mm_storel_pi( (__m64*)a0, t0 ); + _mm_storeh_pi( (__m64*)a1, t0 ); - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; + _mm_storel_pi( (__m64*)a2, t1 ); + _mm_storeh_pi( (__m64*)a3, t1 ); - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; + _mm_storel_pi( (__m64*)a4, t2 ); + _mm_storeh_pi( (__m64*)a5, t2 ); - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; + _mm_storel_pi( (__m64*)a6, t3 ); + _mm_storeh_pi( (__m64*)a7, t3 ); +} - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - } - - inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { +inline void store_8x3_tr( const v8& a, const v8& b, const v8& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; 
+ ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; +} + +inline void store_8x4_tr( const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ __m256 u0, u1, u2, u3; __m256 t0, t1, t2, t3; __m128 s0, s1, s2, s3, s4, s5, s6, s7; @@ -860,23 +812,23 @@ namespace v8 s6 = _mm256_extractf128_ps( t2, 1 ); s7 = _mm256_extractf128_ps( t3, 1 ); - _mm_store_ps( (float *) a0, s0 ); - _mm_store_ps( (float *) a1, s1 ); - _mm_store_ps( (float *) a2, s2 ); - _mm_store_ps( (float *) a3, s3 ); - _mm_store_ps( (float *) a4, s4 ); - _mm_store_ps( (float *) a5, s5 ); - _mm_store_ps( (float *) a6, s6 ); - _mm_store_ps( (float *) a7, s7 ); - } - - inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - const v8 &e, const v8 &f, const v8 &g, const v8 &h, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { + _mm_store_ps( (float*)a0, s0 ); + 
_mm_store_ps( (float*)a1, s1 ); + _mm_store_ps( (float*)a2, s2 ); + _mm_store_ps( (float*)a3, s3 ); + _mm_store_ps( (float*)a4, s4 ); + _mm_store_ps( (float*)a5, s5 ); + _mm_store_ps( (float*)a6, s6 ); + _mm_store_ps( (float*)a7, s7 ); +} + +inline void store_8x8_tr( const v8& a, const v8& b, const v8& c, const v8& d, + const v8& e, const v8& f, const v8& g, const v8& h, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 u0, u1, u2, u3, u4, u5, u6, u7; @@ -908,253 +860,276 @@ namespace v8 t6 = _mm256_permute2f128_ps( u2, u6, 0x31 ); t7 = _mm256_permute2f128_ps( u3, u7, 0x31 ); - _mm256_store_ps( (float *)a0, t0 ); - _mm256_store_ps( (float *)a1, t1 ); - _mm256_store_ps( (float *)a2, t2 ); - _mm256_store_ps( (float *)a3, t3 ); - _mm256_store_ps( (float *)a4, t4 ); - _mm256_store_ps( (float *)a5, t5 ); - _mm256_store_ps( (float *)a6, t6 ); - _mm256_store_ps( (float *)a7, t7 ); - } + _mm256_store_ps( (float*)a0, t0 ); + _mm256_store_ps( (float*)a1, t1 ); + _mm256_store_ps( (float*)a2, t2 ); + _mm256_store_ps( (float*)a3, t3 ); + _mm256_store_ps( (float*)a4, t4 ); + _mm256_store_ps( (float*)a5, t5 ); + _mm256_store_ps( (float*)a6, t6 ); + _mm256_store_ps( (float*)a7, t7 ); +} - ////////////// - // v8int class +////////////// +// v8int class - class v8int : public v8 - { +class v8int : public v8 +{ // v8int prefix unary operator friends - friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator~( const v8int& a 
) ALWAYS_INLINE; + friend inline v8int operator!( const v8int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8int prefix increment / decrement operator friends - friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a ) ALWAYS_INLINE; // v8int postfix increment / decrement operator friends - friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a, int ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a, int ) ALWAYS_INLINE; // v8int binary operator friends - friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator*(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator/( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator%( const 
v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator^( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator|( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int logical operator friends - friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline 
v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8int abs( const v8int& a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& t, + const v8& f ) ALWAYS_INLINE; // v8float unary operator friends - friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int 
operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float miscellaneous friends - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; public: - // v8int constructors / destructors - v8int() {} // Default constructor + v8int() {} // Default constructor - v8int( const v8int &a ) // Copy constructor + v8int( const v8int& a ) // Copy constructor { - v = a.v; + v = a.v; } - v8int( const v8 &a ) // Init from mixed + v8int( const v8& a ) // Init from mixed { - v = a.v; + v = a.v; } - v8int( int a ) // Init from scalar + v8int( int a ) // Init from scalar { - union - { - int i; - float f; - } u; - u.i = a; - v = _mm256_set1_ps( u.f ); + union { + int i; + float f; + } u; + u.i = a; + v = _mm256_set1_ps( u.f ); } - v8int( int i0, int i1, int i2, int i3, - int i4, int i5, int i6, int i7 ) // Init from scalars + v8int( int i0, int i1, int i2, int i3, int i4, int i5, int i6, + int i7 ) // Init from scalars { - union - { - int i; - float f; - } u0, u1, u2, u3, u4, u5, u6, u7; - - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; - u4.i = i4; u5.i = i5; u6.i = i6; u7.i = i7; - - v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, - u4.f, u5.f, u6.f, u7.f ); + union { + int i; + float f; + } u0, u1, u2, u3, u4, u5, u6, u7; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + u4.i = i4; + u5.i = i5; + u6.i = i6; + u7.i = i7; + + v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, u4.f, u5.f, u6.f, u7.f ); } - ~v8int() {} // Destructor + ~v8int() {} // Destructor // v8int assignment operators -# define ASSIGN(op) \ - inline 
v8int &operator op( const v8int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - i[4] op b.i[4]; \ - i[5] op b.i[5]; \ - i[6] op b.i[6]; \ - i[7] op b.i[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8int& operator op( const v8int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ } - inline v8int &operator =( const v8int &b ) + inline v8int& operator=( const v8int& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) - inline v8int &operator ^=( const v8int &b ) + inline v8int& operator^=( const v8int& b ) { - v = _mm256_xor_ps( v, b.v ); - return *this; + v = _mm256_xor_ps( v, b.v ); + return *this; } - inline v8int &operator &=( const v8int &b ) + inline v8int& operator&=( const v8int& b ) { - v = _mm256_and_ps( v, b.v ); - return *this; + v = _mm256_and_ps( v, b.v ); + return *this; } - inline v8int &operator |=( const v8int &b ) + inline v8int& operator|=( const v8int& b ) { - v = _mm256_or_ps( v, b.v ); - return *this; + v = _mm256_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v8int member access operator - inline int &operator []( int n ) - { - return i[n]; - } - - inline int operator ()( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v8int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v8int operator op( const v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] 
); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ } - }; - - // v8int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v8int operator op( const v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - inline v8int operator +( const v8int & a ) - { +inline v8int operator+( const v8int& a ) +{ v8int b; b.v = a.v; return b; - } +} - PREFIX_UNARY(-) +PREFIX_UNARY( -) - inline v8int operator !( const v8int & a ) - { +inline v8int operator!( const v8int& a ) +{ v8int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); - b.i[4] = - ( !a.i[4] ); - b.i[5] = - ( !a.i[5] ); - b.i[6] = - ( !a.i[6] ); - b.i[7] = - ( !a.i[7] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); return b; - } +} - inline v8int operator ~( const v8int & a ) - { +inline v8int operator~( const v8int& a ) +{ v8int b; - union - { - int i; - float f; + union { + int i; + float f; } u; u.i = -1; @@ -1162,141 +1137,141 @@ namespace v8 b.v = _mm256_xor_ps( a.v, _mm256_set1_ps( u.f ) ); return b; - } - -# undef PREFIX_UNARY - - // v8int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v8int operator op( v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } +} + +#undef PREFIX_UNARY + +// v8int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + 
b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v8int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v8int operator op( v8int & a, int ) \ - { \ - v8int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - b.i[4] = ( a.i[4] op ); \ - b.i[5] = ( a.i[5] op ); \ - b.i[6] = ( a.i[6] op ); \ - b.i[7] = ( a.i[7] op ); \ - return b; \ - } +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v8int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v8int binary operators - -# define BINARY(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - c.i[4] = a.i[4] op b.i[4]; \ - c.i[5] = a.i[5] op b.i[5]; \ - c.i[6] = a.i[6] op b.i[6]; \ - c.i[7] = a.i[7] op b.i[7]; \ - return c; \ - } +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v8int binary operators + +#define BINARY( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ 
+ c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) - inline v8int operator ^( const v8int &a, const v8int &b ) - { +inline v8int operator^( const v8int& a, const v8int& b ) +{ v8int c; c.v = _mm256_xor_ps( a.v, b.v ); return c; - } +} - inline v8int operator &( const v8int &a, const v8int &b ) - { +inline v8int operator&( const v8int& a, const v8int& b ) +{ v8int c; c.v = _mm256_and_ps( a.v, b.v ); return c; - } +} - inline v8int operator |( const v8int &a, const v8int &b ) - { +inline v8int operator|( const v8int& a, const v8int& b ) +{ v8int c; c.v = _mm256_or_ps( a.v, b.v ); return c; - } - - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v8int logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = - ( a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - c.i[4] = - ( a.i[4] op b.i[4] ); \ - c.i[5] = - ( a.i[5] op b.i[5] ); \ - c.i[6] = - ( a.i[6] op b.i[6] ); \ - c.i[7] = - ( a.i[7] op b.i[7] ); \ - return c; \ - } +} + +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v8int logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + return c; \ + } - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) -# undef LOGICAL +#undef LOGICAL - // v8int miscellaneous 
functions +// v8int miscellaneous functions - inline v8int abs( const v8int &a ) - { +inline v8int abs( const v8int& a ) +{ v8int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -1309,209 +1284,238 @@ namespace v8 b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; return b; - } +} - inline v8 czero( const v8int &c, const v8 &a ) - { +inline v8 czero( const v8int& c, const v8& a ) +{ v8 b; b.v = _mm256_andnot_ps( c.v, a.v ); return b; - } +} - inline v8 notczero( const v8int &c, const v8 &a ) - { +inline v8 notczero( const v8int& c, const v8& a ) +{ v8 b; b.v = _mm256_and_ps( c.v, a.v ); return b; - } +} - inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) - { +inline v8 merge( const v8int& c, const v8& t, const v8& f ) +{ __m256 c_v = c.v; v8 tf; - tf.v = _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), - _mm256_and_ps( c_v, t.v ) ); + tf.v = + _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), _mm256_and_ps( c_v, t.v ) ); return tf; - } +} - //////////////// - // v8float class +//////////////// +// v8float class - class v8float : public v8 - { +class v8float : public v8 +{ // v8float prefix unary operator friends - friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator~( const v8float& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8float prefix increment / decrement operator friends - friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a ) 
ALWAYS_INLINE; + friend inline v8float operator--( v8float& a ) ALWAYS_INLINE; // v8float postfix increment / decrement operator friends - friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a, int ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a, int ) ALWAYS_INLINE; // v8float binary operator friends - friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator*(const v8float& a, + const v8float& b)ALWAYS_INLINE; + friend inline v8float operator/( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int 
operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float math library friends -# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ - const v8float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v8float fn( const v8float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v8float fn( const v8float& a, const v8float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v8float miscellaneous friends - friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float 
rsqrt ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rsqrt( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp( const v8float& a ) ALWAYS_INLINE; + friend inline v8float fma( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline void increment_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float* ALIGNED( 16 ) 
p, + const v8float& a ) ALWAYS_INLINE; + friend inline void scale_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; public: - // v8float constructors / destructors - v8float() {} // Default constructor + v8float() {} // Default constructor - v8float( const v8float &a ) // Copy constructor + v8float( const v8float& a ) // Copy constructor { - v = a.v; + v = a.v; } - v8float( const v8 &a ) // Init from mixed + v8float( const v8& a ) // Init from mixed { - v = a.v; + v = a.v; } - v8float( float a ) // Init from scalar + v8float( float a ) // Init from scalar { - v = _mm256_set1_ps( a ); + v = _mm256_set1_ps( a ); } - v8float( float f0, float f1, float f2, float f3, - float f4, float f5, float f6, float f7 ) // Init from scalars + v8float( float f0, float f1, float f2, float f3, float f4, float f5, + float f6, float f7 ) // Init from scalars { - v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); + v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); } - ~v8float() {} // Destructor + ~v8float() {} // Destructor // v8float assignment operators -# define ASSIGN(op,intrin) \ - inline v8float &operator op( const v8float &b ) \ - { \ - v = intrin(v,b.v); \ - return *this; \ +#define ASSIGN( op, intrin ) \ + inline v8float& operator op( const v8float& b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v8float &operator =( const v8float &b ) + inline v8float& operator=( const v8float& b ) { - v = b.v; - return *this; + v = b.v; + return *this; } - ASSIGN(+=,_mm256_add_ps) - ASSIGN(-=,_mm256_sub_ps) - ASSIGN(*=,_mm256_mul_ps) - ASSIGN(/=,_mm256_div_ps) + ASSIGN( +=, _mm256_add_ps ) + ASSIGN( -=, _mm256_sub_ps ) + ASSIGN( *=, _mm256_mul_ps ) + ASSIGN( /=, _mm256_div_ps ) -# undef ASSIGN +#undef ASSIGN // v8float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { 
return f[n]; } +}; - // v8float prefix unary operators +// v8float prefix unary operators - inline v8float operator +( const v8float &a ) - { +inline v8float operator+( const v8float& a ) +{ v8float b; b.v = a.v; return b; - } +} - inline v8float operator -( const v8float &a ) - { +inline v8float operator-( const v8float& a ) +{ v8float b; b.v = _mm256_sub_ps( _mm256_setzero_ps(), a.v ); return b; - } +} - inline v8int operator !( const v8float &a ) - { +inline v8int operator!( const v8float& a ) +{ v8int b; b.v = _mm256_cmp_ps( _mm256_setzero_ps(), a.v, _CMP_EQ_OS ); return b; - } +} - // v8float prefix increment / decrement operators +// v8float prefix increment / decrement operators - inline v8float operator ++( v8float &a ) - { +inline v8float operator++( v8float& a ) +{ v8float b; __m256 t = _mm256_add_ps( a.v, _mm256_set1_ps( 1.0f ) ); @@ -1519,10 +1523,10 @@ namespace v8 b.v = t; return b; - } +} - inline v8float operator --( v8float &a ) - { +inline v8float operator--( v8float& a ) +{ v8float b; __m256 t = _mm256_sub_ps( a.v, _mm256_set1_ps( 1.0f ) ); @@ -1530,12 +1534,12 @@ namespace v8 b.v = t; return b; - } +} - // v8float postfix increment / decrement operators +// v8float postfix increment / decrement operators - inline v8float operator ++( v8float &a, int ) - { +inline v8float operator++( v8float& a, int ) +{ v8float b; __m256 a_v = a.v; @@ -1543,159 +1547,159 @@ namespace v8 b.v = a_v; return b; - } +} - inline v8float operator --( v8float &a, int ) - { +inline v8float operator--( v8float& a, int ) +{ v8float b; __m256 a_v = a.v; - a.v = _mm256_sub_ps(a_v, _mm256_set1_ps( 1.0f ) ); + a.v = _mm256_sub_ps( a_v, _mm256_set1_ps( 1.0f ) ); b.v = a_v; return b; - } +} - // v8float binary operators +// v8float binary operators -# define BINARY(op,intrin) \ - inline v8float operator op( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.v = intrin( a.v, b.v ); \ - return c; \ - } +#define BINARY( op, intrin ) \ + inline v8float operator op( 
const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } - BINARY( +, _mm256_add_ps ) - BINARY( -, _mm256_sub_ps ) - BINARY( *, _mm256_mul_ps ) - BINARY( /, _mm256_div_ps ) +BINARY( +, _mm256_add_ps ) +BINARY( -, _mm256_sub_ps ) +BINARY( *, _mm256_mul_ps ) +BINARY( /, _mm256_div_ps ) -# undef BINARY +#undef BINARY - // v8float logical operators +// v8float logical operators -# define LOGICAL(op,intrin,flag) \ - inline v8int operator op( const v8float &a, const v8float &b ) \ - { \ - v8int c; \ - c.v = intrin( a.v, b.v, flag ); \ - return c; \ - } +#define LOGICAL( op, intrin, flag ) \ + inline v8int operator op( const v8float& a, const v8float& b ) \ + { \ + v8int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } - LOGICAL( <, _mm256_cmp_ps, _CMP_LT_OS ) - LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) - LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) - LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) - LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) - LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) +LOGICAL( <, _mm256_cmp_ps, _CMP_LT_OS ) +LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) +LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) +LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) +LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) +LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) - inline v8int operator &&( const v8float &a, const v8float &b ) - { +inline v8int operator&&( const v8float& a, const v8float& b ) +{ v8int c; __m256 vzero = _mm256_setzero_ps(); c.v = _mm256_and_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), - _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); return c; - } +} - inline v8int operator ||( const v8float &a, const v8float &b ) - { +inline v8int operator||( const v8float& a, const v8float& b ) +{ v8int c; __m256 vzero = _mm256_setzero_ps(); c.v = _mm256_or_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), - _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); return c; - } - -# 
undef LOGICAL - - // v8float math library functions - -# define CMATH_FR1(fn) \ - inline v8float fn( const v8float &a ) \ - { \ - v8float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - b.f[4] = ::fn( a.f[4] ); \ - b.f[5] = ::fn( a.f[5] ); \ - b.f[6] = ::fn( a.f[6] ); \ - b.f[7] = ::fn( a.f[7] ); \ - return b; \ - } +} + +#undef LOGICAL + +// v8float math library functions + +#define CMATH_FR1( fn ) \ + inline v8float fn( const v8float& a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } -# define CMATH_FR2(fn) \ - inline v8float fn( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - c.f[4] = ::fn( a.f[4], b.f[4] ); \ - c.f[5] = ::fn( a.f[5], b.f[5] ); \ - c.f[6] = ::fn( a.f[6], b.f[6] ); \ - c.f[7] = ::fn( a.f[7], b.f[7] ); \ - return c; \ - } +#define CMATH_FR2( fn ) \ + inline v8float fn( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) +CMATH_FR1( acos ) +CMATH_FR1( asin ) 
CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) + /*CMATH_FR1(fabs)*/ CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) + CMATH_FR1( log10 ) CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + /*CMATH_FR1(sqrt)*/ CMATH_FR1( tan ) CMATH_FR1( tanh ) - inline v8float fabs( const v8float &a ) - { + inline v8float fabs( const v8float& a ) +{ v8float b; b.v = _mm256_andnot_ps( _mm256_set1_ps( -0.0f ), a.v ); return b; - } +} - inline v8float sqrt( const v8float &a ) - { +inline v8float sqrt( const v8float& a ) +{ v8float b; b.v = _mm256_sqrt_ps( a.v ); return b; - } +} - inline v8float copysign( const v8float &a, const v8float &b ) - { +inline v8float copysign( const v8float& a, const v8float& b ) +{ v8float c; __m256 t = _mm256_set1_ps( -0.0f ); - c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), - _mm256_andnot_ps( t, a.v ) ); + c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), _mm256_andnot_ps( t, a.v ) ); return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v8float miscellaneous functions +// v8float miscellaneous functions - inline v8float rsqrt_approx( const v8float &a ) - { +inline v8float rsqrt_approx( const v8float& a ) +{ v8float b; - b.v = _mm256_rsqrt_ps(a.v); + b.v = _mm256_rsqrt_ps( a.v ); return b; - } +} - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1711,27 +1715,29 @@ namespace v8 return b; } - #endif +#endif - inline v8float rsqrt( const v8float &a ) - { +inline v8float rsqrt( const v8float& a ) +{ v8float b; __m256 a_v = a.v, b_v; - b_v = _mm256_rsqrt_ps(a_v); + b_v = _mm256_rsqrt_ps( a_v ); // Note: It is quicker to just call div_ps and sqrt_ps if more // refinement desired! 
- b.v = _mm256_add_ps( b_v, _mm256_mul_ps( _mm256_set1_ps( 0.5f ), - _mm256_sub_ps( b_v, - _mm256_mul_ps( a_v, - _mm256_mul_ps( b_v, - _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); + b.v = _mm256_add_ps( + b_v, _mm256_mul_ps( + _mm256_set1_ps( 0.5f ), + _mm256_sub_ps( + b_v, _mm256_mul_ps( + a_v, _mm256_mul_ps( + b_v, _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); return b; - } +} - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1740,9 +1746,9 @@ namespace v8 return b; } - #endif +#endif - #if 0 +#if 0 inline v8float rsqrt( const v8float &a ) { v8float b; @@ -1752,18 +1758,18 @@ namespace v8 return b; } - #endif +#endif - inline v8float rcp_approx( const v8float &a ) - { +inline v8float rcp_approx( const v8float& a ) +{ v8float b; b.v = _mm256_rcp_ps( a.v ); return b; - } +} - #if 0 +#if 0 inline v8float rcp( const v8float &a ) { v8float b; @@ -1779,21 +1785,21 @@ namespace v8 return b; } - #endif +#endif - inline v8float rcp( const v8float &a ) - { +inline v8float rcp( const v8float& a ) +{ v8float b; __m256 a_v = a.v, b_v; b_v = _mm256_rcp_ps( a_v ); b.v = _mm256_sub_ps( _mm256_add_ps( b_v, b_v ), - _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); + _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); return b; - } +} - #if 0 +#if 0 inline v8float rcp( const v8float &a ) { v8float b; @@ -1802,76 +1808,76 @@ namespace v8 return b; } - #endif +#endif - inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fma( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.v = _mm256_fmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.v = _mm256_fmsub_ps( a.v, b.v, c.v ); return d; - } +} - inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fnms( const v8float& a, const v8float& b, const v8float& c ) +{ 
v8float d; d.v = _mm256_fnmadd_ps( a.v, b.v, c.v ); return d; - } +} - inline v8float clear_bits( const v8int &m, const v8float &a ) - { +inline v8float clear_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_andnot_ps( m.v, a.v ); return b; - } +} - inline v8float set_bits( const v8int &m, const v8float &a ) - { +inline v8float set_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_or_ps( m.v, a.v ); return b; - } +} - inline v8float toggle_bits( const v8int &m, const v8float &a ) - { +inline v8float toggle_bits( const v8int& m, const v8float& a ) +{ v8float b; b.v = _mm256_xor_ps( m.v, a.v ); return b; - } +} - inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void increment_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_add_ps( _mm256_load_ps( p ), a.v ) ); - } +} - inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void decrement_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_sub_ps( _mm256_load_ps( p ), a.v ) ); - } +} - inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void scale_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ _mm256_store_ps( p, _mm256_mul_ps( _mm256_load_ps( p ), a.v ) ); - } +} } // namespace v8 diff --git a/src/util/v8/v8_portable.h b/src/util/v8/v8_portable.h index b8d6b0c8..e0f18ab4 100644 --- a/src/util/v8/v8_portable.h +++ b/src/util/v8/v8_portable.h @@ -11,192 +11,171 @@ #define V8_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v8 { - class v8; - class v8int; - class v8float; +class v8; +class v8int; +class v8float; - //////////////// - // v8 base class +//////////////// +// v8 base class - class v8 - { +class v8 +{ friend class v8int; friend class v8float; // v8 miscellaneous friends - 
friend inline int any( const v8 &a ) ALWAYS_INLINE; - friend inline int all( const v8 &a ) ALWAYS_INLINE; + friend inline int any( const v8& a ) ALWAYS_INLINE; + friend inline int all( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 splat( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 shuffle( const v8& a ) ALWAYS_INLINE; - friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; - friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + friend inline void swap( v8& a, v8& b ) ALWAYS_INLINE; + friend inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, + v8& a5, v8& a6, v8& a7 ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& a, + const v8& b ) ALWAYS_INLINE; // v8 memory manipulation friends - friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; - friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void load_8x1( const void* ALIGNED( 16 ) p, + v8& a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8& a, + 
void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v8 transposed memory manipulation friends // Note: Half aligned values are permissible in the 8x2_tr variants. - friend inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) ALWAYS_INLINE; - - friend inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) ALWAYS_INLINE; - - friend inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; - - friend inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; - - friend inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 
&h ) ALWAYS_INLINE; - - friend inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; - - friend inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3, - void * ALIGNED(8) a4, - void * ALIGNED(8) a5, - void * ALIGNED(8) a6, - void * ALIGNED(8) a7 ) ALWAYS_INLINE; - - friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x4_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x8_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - const v8 &e, const v8 &f, - const v8 &g, const v8 &h, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; + friend inline void load_8x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + const void* a4, const void* a5, + const void* a6, const void* a7, + v8& a ) ALWAYS_INLINE; + + friend inline void + load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, + v8& a, v8& b ) ALWAYS_INLINE; + + friend inline void + load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) 
a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c ) ALWAYS_INLINE; + + friend inline void + load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d ) ALWAYS_INLINE; + + friend inline void + load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, + v8& h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, + void* a3, void* a4, void* a5, void* a6, + void* a7 ) ALWAYS_INLINE; + + friend inline void + store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) ALWAYS_INLINE; + + friend inline void + store_8x3_tr( const v8& a, const v8& b, const v8& c, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, + void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( + const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( + const v8& a, const 
v8& b, const v8& c, const v8& d, const v8& e, + const v8& f, const v8& g, const v8& h, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; protected: - - union - { - int i[8]; - float f[8]; + union { + int i[8]; + float f[8]; }; public: + v8() {} // Default constructor - v8() {} // Default constructor - - v8( const v8 &a ) // Copy constructor + v8( const v8& a ) // Copy constructor { - i[0]=a.i[0]; i[1]=a.i[1]; i[2]=a.i[2]; i[3]=a.i[3]; - i[4]=a.i[4]; i[5]=a.i[5]; i[6]=a.i[6]; i[7]=a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - ~v8() {} // Default destructor - }; + ~v8() {} // Default destructor +}; - // v8 miscellaneous functions +// v8 miscellaneous functions - inline int any( const v8 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3] || - a.i[4] || a.i[5] || a.i[6] || a.i[7]; - } +inline int any( const v8& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7]; +} - inline int all( const v8 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3] && - a.i[4] && a.i[5] && a.i[6] && a.i[7]; - } +inline int all( const v8& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7]; +} - template - inline v8 splat( const v8 & a ) - { +template +inline v8 splat( const v8& a ) +{ v8 b; b.i[0] = a.i[n]; @@ -209,11 +188,11 @@ namespace v8 b.i[7] = a.i[n]; return b; - } +} - template - inline v8 shuffle( const v8 & a ) - { +template +inline v8 shuffle( const v8& a ) +{ v8 b; b.i[0] = a.i[i0]; @@ -226,12 +205,12 @@ namespace v8 b.i[7] = a.i[i7]; return b; - } +} -# define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v8 &a, v8 &b ) - { +inline void swap( v8& a, v8& b ) +{ sw( a.i[0], b.i[0] ); 
sw( a.i[1], b.i[1] ); sw( a.i[2], b.i[2] ); @@ -240,867 +219,912 @@ namespace v8 sw( a.i[5], b.i[5] ); sw( a.i[6], b.i[6] ); sw( a.i[7], b.i[7] ); - } - - inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); - sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); - sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); - sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); - sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); - sw( a6.i[7],a7.i[6] ); - } - -# undef sw - - // v8 memory manipulation functions - - inline void load_8x1( const void * ALIGNED(16) p, - v8 &a ) - { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; - a.i[4] = ((const int * ALIGNED(16))p)[4]; - a.i[5] = ((const int * ALIGNED(16))p)[5]; - a.i[6] = ((const int * ALIGNED(16))p)[6]; - a.i[7] = ((const int * ALIGNED(16))p)[7]; - } - - inline void store_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void stream_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * 
ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void clear_8x1( void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; - ((int * ALIGNED(16))p)[4] = 0; - ((int * ALIGNED(16))p)[5] = 0; - ((int * ALIGNED(16))p)[6] = 0; - ((int * ALIGNED(16))p)[7] = 0; - } - - // FIXME: Ordering semantics - inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; - ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; - ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; - ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; - ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; - } - - inline void swap_8x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +} + +inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, v8& a5, v8& a6, + v8& a7 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a0.i[4], a4.i[0] ); + sw( a0.i[5], a5.i[0] ); + sw( a0.i[6], a6.i[0] ); + sw( a0.i[7], a7.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a1.i[4], a4.i[1] ); + sw( a1.i[5], a5.i[1] ); + sw( a1.i[6], a6.i[1] ); + sw( a1.i[7], a7.i[1] ); + sw( a2.i[3], a3.i[2] ); + sw( a2.i[4], a4.i[2] ); + sw( a2.i[5], a5.i[2] ); + sw( a2.i[6], a6.i[2] ); + sw( a2.i[7], a7.i[2] ); + sw( a3.i[4], a4.i[3] ); + sw( a3.i[5], a5.i[3] ); + sw( a3.i[6], a6.i[3] ); + sw( a3.i[7], a7.i[3] ); + sw( a4.i[5], a5.i[4] ); + sw( a4.i[6], a6.i[4] ); + sw( a4.i[7], a7.i[4] ); + sw( a5.i[6], a6.i[5] ); + sw( a5.i[7], 
a7.i[5] ); + sw( a6.i[7], a7.i[6] ); +} + +#undef sw + +// v8 memory manipulation functions + +inline void load_8x1( const void* ALIGNED( 16 ) p, v8& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 16 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 16 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 16 ))p )[6]; + a.i[7] = ( (const int* ALIGNED( 16 ))p )[7]; +} + +inline void store_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void stream_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void clear_8x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 0; + ( (int* ALIGNED( 16 ))p )[4] = 0; + ( (int* ALIGNED( 16 ))p )[5] = 0; + ( (int* ALIGNED( 16 ))p )[6] = 0; + ( (int* ALIGNED( 16 ))p )[7] = 0; +} + +// FIXME: Ordering semantics +inline void copy_8x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 
))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; + ( (int* ALIGNED( 16 ))dst )[4] = ( (const int* ALIGNED( 16 ))src )[4]; + ( (int* ALIGNED( 16 ))dst )[5] = ( (const int* ALIGNED( 16 ))src )[5]; + ( (int* ALIGNED( 16 ))dst )[6] = ( (const int* ALIGNED( 16 ))src )[6]; + ( (int* ALIGNED( 16 ))dst )[7] = ( (const int* ALIGNED( 16 ))src )[7]; +} + +inline void swap_8x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; - - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; - - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; - - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; - - t = ((int * ALIGNED(16))a)[4]; - ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; - ((int * ALIGNED(16))b)[4] = t; - - t = ((int * ALIGNED(16))a)[5]; - ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; - ((int * ALIGNED(16))b)[5] = t; - - t = ((int * ALIGNED(16))a)[6]; - ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; - ((int * ALIGNED(16))b)[6] = t; - - t = ((int * ALIGNED(16))a)[7]; - ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; - ((int * ALIGNED(16))b)[7] = t; - } - - // v8 transposed memory manipulation functions - - inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) - { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; - a.i[4] = ((const int *)a4)[0]; - a.i[5] = ((const int *)a5)[0]; - a.i[6] = ((const int *)a6)[0]; - a.i[7] = ((const int *)a7)[0]; - } - - inline void load_8x2_tr( const void 
* ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) - { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; - - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; - - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; - - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; - - a.i[4] = ((const int * ALIGNED(8))a4)[0]; - b.i[4] = ((const int * ALIGNED(8))a4)[1]; - - a.i[5] = ((const int * ALIGNED(8))a5)[0]; - b.i[5] = ((const int * ALIGNED(8))a5)[1]; - - a.i[6] = ((const int * ALIGNED(8))a6)[0]; - b.i[6] = ((const int * ALIGNED(8))a6)[1]; - - a.i[7] = ((const int * ALIGNED(8))a7)[0]; - b.i[7] = ((const int * ALIGNED(8))a7)[1]; - } - - inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - - a.i[5] = ((const int * 
ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - } - - inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - d.i[4] = ((const int * ALIGNED(16))a4)[3]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - - a.i[7] = ((const int * 
ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - } - - inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - e.i[0] = ((const int * ALIGNED(16))a0)[4]; - f.i[0] = ((const int * ALIGNED(16))a0)[5]; - g.i[0] = ((const int * ALIGNED(16))a0)[6]; - h.i[0] = ((const int * ALIGNED(16))a0)[7]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - e.i[1] = ((const int * ALIGNED(16))a1)[4]; - f.i[1] = ((const int * ALIGNED(16))a1)[5]; - g.i[1] = ((const int * ALIGNED(16))a1)[6]; - h.i[1] = ((const int * ALIGNED(16))a1)[7]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - e.i[2] = ((const int * ALIGNED(16))a2)[4]; - f.i[2] = ((const int * ALIGNED(16))a2)[5]; - g.i[2] = ((const int * ALIGNED(16))a2)[6]; - h.i[2] = ((const int * ALIGNED(16))a2)[7]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - e.i[3] = ((const int * ALIGNED(16))a3)[4]; - f.i[3] = ((const int * ALIGNED(16))a3)[5]; - g.i[3] = ((const int * ALIGNED(16))a3)[6]; - h.i[3] = ((const int * ALIGNED(16))a3)[7]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - 
b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - d.i[4] = ((const int * ALIGNED(16))a4)[3]; - e.i[4] = ((const int * ALIGNED(16))a4)[4]; - f.i[4] = ((const int * ALIGNED(16))a4)[5]; - g.i[4] = ((const int * ALIGNED(16))a4)[6]; - h.i[4] = ((const int * ALIGNED(16))a4)[7]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - e.i[5] = ((const int * ALIGNED(16))a5)[4]; - f.i[5] = ((const int * ALIGNED(16))a5)[5]; - g.i[5] = ((const int * ALIGNED(16))a5)[6]; - h.i[5] = ((const int * ALIGNED(16))a5)[7]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - e.i[6] = ((const int * ALIGNED(16))a6)[4]; - f.i[6] = ((const int * ALIGNED(16))a6)[5]; - g.i[6] = ((const int * ALIGNED(16))a6)[6]; - h.i[6] = ((const int * ALIGNED(16))a6)[7]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - e.i[7] = ((const int * ALIGNED(16))a7)[4]; - f.i[7] = ((const int * ALIGNED(16))a7)[5]; - g.i[7] = ((const int * ALIGNED(16))a7)[6]; - h.i[7] = ((const int * ALIGNED(16))a7)[7]; - } - - inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) - { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; - ((int *)a4)[0] = a.i[4]; - ((int *)a5)[0] = a.i[5]; - ((int *)a6)[0] = a.i[6]; - ((int *)a7)[0] = a.i[7]; - } - - inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3, - void * ALIGNED(8) a4, void * ALIGNED(8) a5, - void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) - { 
- ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; - - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; - - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; - - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; - - ((int * ALIGNED(8))a4)[0] = a.i[4]; - ((int * ALIGNED(8))a4)[1] = b.i[4]; - - ((int * ALIGNED(8))a5)[0] = a.i[5]; - ((int * ALIGNED(8))a5)[1] = b.i[5]; - - ((int * ALIGNED(8))a6)[0] = a.i[6]; - ((int * ALIGNED(8))a6)[1] = b.i[6]; - - ((int * ALIGNED(8))a7)[0] = a.i[7]; - ((int * ALIGNED(8))a7)[1] = b.i[7]; - } - - inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - } - - inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) 
a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; - } - - inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - const v8 &e, const v8 &f, const v8 &g, const v8 &h, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - ((int * ALIGNED(16))a0)[4] = e.i[0]; - ((int * ALIGNED(16))a0)[5] = f.i[0]; - ((int * ALIGNED(16))a0)[6] = g.i[0]; 
- ((int * ALIGNED(16))a0)[7] = h.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - ((int * ALIGNED(16))a1)[4] = e.i[1]; - ((int * ALIGNED(16))a1)[5] = f.i[1]; - ((int * ALIGNED(16))a1)[6] = g.i[1]; - ((int * ALIGNED(16))a1)[7] = h.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - ((int * ALIGNED(16))a2)[4] = e.i[2]; - ((int * ALIGNED(16))a2)[5] = f.i[2]; - ((int * ALIGNED(16))a2)[6] = g.i[2]; - ((int * ALIGNED(16))a2)[7] = h.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - ((int * ALIGNED(16))a3)[4] = e.i[3]; - ((int * ALIGNED(16))a3)[5] = f.i[3]; - ((int * ALIGNED(16))a3)[6] = g.i[3]; - ((int * ALIGNED(16))a3)[7] = h.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - ((int * ALIGNED(16))a4)[4] = e.i[4]; - ((int * ALIGNED(16))a4)[5] = f.i[4]; - ((int * ALIGNED(16))a4)[6] = g.i[4]; - ((int * ALIGNED(16))a4)[7] = h.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - ((int * ALIGNED(16))a5)[4] = e.i[5]; - ((int * ALIGNED(16))a5)[5] = f.i[5]; - ((int * ALIGNED(16))a5)[6] = g.i[5]; - ((int * ALIGNED(16))a5)[7] = h.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - ((int * ALIGNED(16))a6)[4] = e.i[6]; - ((int * ALIGNED(16))a6)[5] = f.i[6]; - ((int * ALIGNED(16))a6)[6] = g.i[6]; - ((int * ALIGNED(16))a6)[7] = h.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = 
b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; - ((int * ALIGNED(16))a7)[4] = e.i[7]; - ((int * ALIGNED(16))a7)[5] = f.i[7]; - ((int * ALIGNED(16))a7)[6] = g.i[7]; - ((int * ALIGNED(16))a7)[7] = h.i[7]; - } - - ////////////// - // v8int class - - class v8int : public v8 - { + t = ( (int* ALIGNED( 16 ))a )[0]; + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; + + t = ( (int* ALIGNED( 16 ))a )[1]; + ( (int* ALIGNED( 16 ))a )[1] = ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; + + t = ( (int* ALIGNED( 16 ))a )[2]; + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; + + t = ( (int* ALIGNED( 16 ))a )[3]; + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; + + t = ( (int* ALIGNED( 16 ))a )[4]; + ( (int* ALIGNED( 16 ))a )[4] = ( (int* ALIGNED( 16 ))b )[4]; + ( (int* ALIGNED( 16 ))b )[4] = t; + + t = ( (int* ALIGNED( 16 ))a )[5]; + ( (int* ALIGNED( 16 ))a )[5] = ( (int* ALIGNED( 16 ))b )[5]; + ( (int* ALIGNED( 16 ))b )[5] = t; + + t = ( (int* ALIGNED( 16 ))a )[6]; + ( (int* ALIGNED( 16 ))a )[6] = ( (int* ALIGNED( 16 ))b )[6]; + ( (int* ALIGNED( 16 ))b )[6] = t; + + t = ( (int* ALIGNED( 16 ))a )[7]; + ( (int* ALIGNED( 16 ))a )[7] = ( (int* ALIGNED( 16 ))b )[7]; + ( (int* ALIGNED( 16 ))b )[7] = t; +} + +// v8 transposed memory manipulation functions + +inline void load_8x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, const void* a4, const void* a5, + const void* a6, const void* a7, v8& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; + a.i[4] = ( (const int*)a4 )[0]; + a.i[5] = ( (const int*)a5 )[0]; + a.i[6] = ( (const int*)a6 )[0]; + a.i[7] = ( (const int*)a7 )[0]; +} + +inline void +load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + 
const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, v8& a, + v8& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a4 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a5 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a6 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a7 )[1]; +} + +inline void +load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + 
+ a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; +} + +inline void +load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 
))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; +} + +inline void +load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, v8& h ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + e.i[0] = ( (const int* ALIGNED( 16 ))a0 )[4]; + f.i[0] = ( (const int* ALIGNED( 16 ))a0 )[5]; + g.i[0] = ( (const int* ALIGNED( 16 ))a0 )[6]; + h.i[0] = ( (const int* ALIGNED( 16 ))a0 )[7]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + e.i[1] = ( (const int* ALIGNED( 16 ))a1 )[4]; + f.i[1] = ( (const int* ALIGNED( 16 ))a1 )[5]; + g.i[1] = ( (const int* ALIGNED( 16 ))a1 )[6]; + h.i[1] = ( (const int* ALIGNED( 16 ))a1 )[7]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + e.i[2] = ( (const int* ALIGNED( 16 ))a2 )[4]; + f.i[2] = ( (const int* ALIGNED( 16 ))a2 )[5]; + g.i[2] = ( (const int* ALIGNED( 16 ))a2 )[6]; + h.i[2] = ( (const int* ALIGNED( 16 ))a2 )[7]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + e.i[3] 
= ( (const int* ALIGNED( 16 ))a3 )[4]; + f.i[3] = ( (const int* ALIGNED( 16 ))a3 )[5]; + g.i[3] = ( (const int* ALIGNED( 16 ))a3 )[6]; + h.i[3] = ( (const int* ALIGNED( 16 ))a3 )[7]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + e.i[4] = ( (const int* ALIGNED( 16 ))a4 )[4]; + f.i[4] = ( (const int* ALIGNED( 16 ))a4 )[5]; + g.i[4] = ( (const int* ALIGNED( 16 ))a4 )[6]; + h.i[4] = ( (const int* ALIGNED( 16 ))a4 )[7]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + e.i[5] = ( (const int* ALIGNED( 16 ))a5 )[4]; + f.i[5] = ( (const int* ALIGNED( 16 ))a5 )[5]; + g.i[5] = ( (const int* ALIGNED( 16 ))a5 )[6]; + h.i[5] = ( (const int* ALIGNED( 16 ))a5 )[7]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + e.i[6] = ( (const int* ALIGNED( 16 ))a6 )[4]; + f.i[6] = ( (const int* ALIGNED( 16 ))a6 )[5]; + g.i[6] = ( (const int* ALIGNED( 16 ))a6 )[6]; + h.i[6] = ( (const int* ALIGNED( 16 ))a6 )[7]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; + e.i[7] = ( (const int* ALIGNED( 16 ))a7 )[4]; + f.i[7] = ( (const int* ALIGNED( 16 ))a7 )[5]; + g.i[7] = ( (const int* ALIGNED( 16 ))a7 )[6]; + h.i[7] = ( (const int* ALIGNED( 16 ))a7 )[7]; +} + +inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, void* a3, + void* a4, void* a5, void* a6, void* a7 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; + ( (int*)a4 )[0] = a.i[4]; + 
( (int*)a5 )[0] = a.i[5]; + ( (int*)a6 )[0] = a.i[6]; + ( (int*)a7 )[0] = a.i[7]; +} + +inline void store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a4 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a5 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a6 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a7 )[1] = b.i[7]; +} + +inline void store_8x3_tr( const v8& a, const v8& b, const v8& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = 
b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; +} + +inline void store_8x4_tr( const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; +} + +inline void store_8x8_tr( const v8& a, 
const v8& b, const v8& c, const v8& d, + const v8& e, const v8& f, const v8& g, const v8& h, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + ( (int* ALIGNED( 16 ))a0 )[4] = e.i[0]; + ( (int* ALIGNED( 16 ))a0 )[5] = f.i[0]; + ( (int* ALIGNED( 16 ))a0 )[6] = g.i[0]; + ( (int* ALIGNED( 16 ))a0 )[7] = h.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + ( (int* ALIGNED( 16 ))a1 )[4] = e.i[1]; + ( (int* ALIGNED( 16 ))a1 )[5] = f.i[1]; + ( (int* ALIGNED( 16 ))a1 )[6] = g.i[1]; + ( (int* ALIGNED( 16 ))a1 )[7] = h.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + ( (int* ALIGNED( 16 ))a2 )[4] = e.i[2]; + ( (int* ALIGNED( 16 ))a2 )[5] = f.i[2]; + ( (int* ALIGNED( 16 ))a2 )[6] = g.i[2]; + ( (int* ALIGNED( 16 ))a2 )[7] = h.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + ( (int* ALIGNED( 16 ))a3 )[4] = e.i[3]; + ( (int* ALIGNED( 16 ))a3 )[5] = f.i[3]; + ( (int* ALIGNED( 16 ))a3 )[6] = g.i[3]; + ( (int* ALIGNED( 16 ))a3 )[7] = h.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + ( (int* ALIGNED( 16 ))a4 )[4] = e.i[4]; + ( (int* ALIGNED( 16 ))a4 )[5] = f.i[4]; + ( (int* ALIGNED( 16 ))a4 )[6] = g.i[4]; + ( (int* ALIGNED( 16 ))a4 )[7] = h.i[4]; + + ( (int* 
ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + ( (int* ALIGNED( 16 ))a5 )[4] = e.i[5]; + ( (int* ALIGNED( 16 ))a5 )[5] = f.i[5]; + ( (int* ALIGNED( 16 ))a5 )[6] = g.i[5]; + ( (int* ALIGNED( 16 ))a5 )[7] = h.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + ( (int* ALIGNED( 16 ))a6 )[4] = e.i[6]; + ( (int* ALIGNED( 16 ))a6 )[5] = f.i[6]; + ( (int* ALIGNED( 16 ))a6 )[6] = g.i[6]; + ( (int* ALIGNED( 16 ))a6 )[7] = h.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; + ( (int* ALIGNED( 16 ))a7 )[4] = e.i[7]; + ( (int* ALIGNED( 16 ))a7 )[5] = f.i[7]; + ( (int* ALIGNED( 16 ))a7 )[6] = g.i[7]; + ( (int* ALIGNED( 16 ))a7 )[7] = h.i[7]; +} + +////////////// +// v8int class + +class v8int : public v8 +{ // v8int prefix unary operator friends - friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator~( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8int prefix increment / decrement operator friends - friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a ) 
ALWAYS_INLINE; // v8int postfix increment / decrement operator friends - friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a, int ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a, int ) ALWAYS_INLINE; // v8int binary operator friends - friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator*(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator/( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator%( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator^( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator|( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>>( const v8int& a, + const v8int& b ) 
ALWAYS_INLINE; // v8int logical operator friends - friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8int abs( const v8int& a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& t, + const v8& f ) ALWAYS_INLINE; // v8float unary operator friends - friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float miscellaneous friends - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float 
toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; public: - // v8int constructors / destructors - v8int() {} // Default constructor + v8int() {} // Default constructor - v8int( const v8int &a ) // Copy constructor + v8int( const v8int& a ) // Copy constructor { - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; - i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - v8int( const v8 &a ) // Init from mixed + v8int( const v8& a ) // Init from mixed { - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; - i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - v8int( int a ) // Init from scalar + v8int( int a ) // Init from scalar { - i[0] = a; i[1] = a; i[2] = a; i[3] = a; - i[4] = a; i[5] = a; i[6] = a; i[7] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; + i[4] = a; + i[5] = a; + i[6] = a; + i[7] = a; } - v8int( int i0, int i1, int i2, int i3, - int i4, int i5, int i6, int i7 ) // Init from scalars + v8int( int i0, int i1, int i2, int i3, int i4, int i5, int i6, + int i7 ) // Init from scalars { - i[0] = i0; i[1] = i1; i[2] = i2; i[3] = i3; - i[4] = i4; i[5] = i5; i[6] = i6; i[7] = i7; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + i[4] = i4; + i[5] = i5; + i[6] = i6; + i[7] = i7; } - ~v8int() {} // Destructor + ~v8int() {} // Destructor // v8int assignment operators -# define ASSIGN(op) \ - inline v8int &operator op( const v8int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op 
b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - i[4] op b.i[4]; \ - i[5] op b.i[5]; \ - i[6] op b.i[6]; \ - i[7] op b.i[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8int& operator op( const v8int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v8int member access operator - inline int &operator []( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v8int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v8int operator op( const v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ } - inline int operator ()( int n ) - { - return i[n]; - } - }; - - // v8int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v8int operator op( const v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v8int operator !( const v8int & a ) - { +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v8int operator!( const v8int& a ) +{ v8int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( 
!a.i[2] ); - b.i[3] = - ( !a.i[3] ); - b.i[4] = - ( !a.i[4] ); - b.i[5] = - ( !a.i[5] ); - b.i[6] = - ( !a.i[6] ); - b.i[7] = - ( !a.i[7] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); return b; - } - - PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - // v8int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v8int operator op( v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v8int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v8int operator op( v8int & a, int ) \ - { \ - v8int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - b.i[4] = ( a.i[4] op ); \ - b.i[5] = ( a.i[5] op ); \ - b.i[6] = ( a.i[6] op ); \ - b.i[7] = ( a.i[7] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v8int binary operators - -# define BINARY(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - c.i[4] = a.i[4] op b.i[4]; \ - c.i[5] = a.i[5] op b.i[5]; \ - c.i[6] = a.i[6] op b.i[6]; \ - c.i[7] = a.i[7] op b.i[7]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v8int logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = - ( 
a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - c.i[4] = - ( a.i[4] op b.i[4] ); \ - c.i[5] = - ( a.i[5] op b.i[5] ); \ - c.i[6] = - ( a.i[6] op b.i[6] ); \ - c.i[7] = - ( a.i[7] op b.i[7] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8int miscellaneous functions - - inline v8int abs( const v8int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v8int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v8int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v8int binary operators + +#define BINARY( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef 
BINARY + +// v8int logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8int miscellaneous functions + +inline v8int abs( const v8int& a ) +{ v8int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -1113,10 +1137,10 @@ namespace v8 b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; return b; - } +} - inline v8 czero( const v8int &c, const v8 &a ) - { +inline v8 czero( const v8int& c, const v8& a ) +{ v8 b; b.i[0] = a.i[0] & ~c.i[0]; @@ -1129,10 +1153,10 @@ namespace v8 b.i[7] = a.i[7] & ~c.i[7]; return b; - } +} - inline v8 notczero( const v8int &c, const v8 &a ) - { +inline v8 notczero( const v8int& c, const v8& a ) +{ v8 b; b.i[0] = a.i[0] & c.i[0]; @@ -1145,10 +1169,10 @@ namespace v8 b.i[7] = a.i[7] & c.i[7]; return b; - } +} - inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) - { +inline v8 merge( const v8int& c, const v8& t, const v8& f ) +{ v8 m; m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); @@ -1161,180 +1185,209 @@ namespace v8 m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); return m; - } +} - //////////////// - // v8float class +//////////////// +// v8float class - class v8float : public v8 - { +class v8float : public v8 +{ // v8float prefix unary operator friends - friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8float &a ) 
ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator~( const v8float& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8float prefix increment / decrement operator friends - friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a ) ALWAYS_INLINE; // v8float postfix increment / decrement operator friends - friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a, int ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a, int ) ALWAYS_INLINE; // v8float binary operator friends - friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator*(const v8float& a, + const v8float& b)ALWAYS_INLINE; + friend inline v8float operator/( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( 
const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float math library friends -# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ - const v8float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v8float fn( const v8float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v8float fn( const v8float& a, const v8float& b ) ALWAYS_INLINE 
+ + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v8float miscellaneous friends - friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rsqrt( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp( const v8float& a ) ALWAYS_INLINE; + friend inline v8float fma( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fms( 
const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline void increment_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void scale_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; public: - // v8float constructors / destructors - v8float() {} // Default constructor + v8float() {} // Default constructor - v8float( const v8float &a ) // Copy constructor + v8float( const v8float& a ) // Copy constructor { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; - f[4] = a.f[4]; - f[5] = a.f[5]; - f[6] = a.f[6]; - f[7] = a.f[7]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; } - v8float( const v8 &a ) // Init from mixed + v8float( const v8& a ) // Init from mixed { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; - f[4] = a.f[4]; - f[5] = a.f[5]; - f[6] = a.f[6]; - f[7] = a.f[7]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; } - v8float( float a ) // Init from scalar + v8float( float a ) // Init from scalar { - f[0] = a; - f[1] = a; - f[2] = a; - f[3] = a; - f[4] = a; - f[5] = a; - f[6] = a; - f[7] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; } - v8float( float f0, float f1, float f2, float f3, - float f4, float f5, float f6, float f7 ) // Init from scalars + v8float( float f0, 
float f1, float f2, float f3, float f4, float f5, + float f6, float f7 ) // Init from scalars { - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; - f[4] = f4; - f[5] = f5; - f[6] = f6; - f[7] = f7; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + f[4] = f4; + f[5] = f5; + f[6] = f6; + f[7] = f7; } - ~v8float() {} // Destructor + ~v8float() {} // Destructor // v8float assignment operators -# define ASSIGN(op) \ - inline v8float &operator op( const v8float &b ) \ - { \ - f[0] op b.f[0]; \ - f[1] op b.f[1]; \ - f[2] op b.f[2]; \ - f[3] op b.f[3]; \ - f[4] op b.f[4]; \ - f[5] op b.f[5]; \ - f[6] op b.f[6]; \ - f[7] op b.f[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8float& operator op( const v8float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v8float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v8float prefix unary operators +// v8float prefix unary operators - inline v8float operator +( const v8float &a ) - { +inline v8float operator+( const v8float& a ) +{ v8float b; b.f[0] = +a.f[0]; @@ -1347,10 +1400,10 @@ namespace v8 b.f[7] = +a.f[7]; return b; - } +} - inline v8float operator -( const v8float &a ) - { +inline v8float operator-( const v8float& a ) +{ v8float b; b.f[0] = -a.f[0]; @@ -1363,10 +1416,10 @@ namespace v8 b.f[7] = -a.f[7]; return b; - } +} - inline v8int operator !( const v8float &a ) - { +inline v8int operator!( const v8float& a ) +{ v8int b; b.i[0] = a.i[0] ? 
0 : -1; @@ -1379,12 +1432,12 @@ namespace v8 b.i[7] = a.i[7] ? 0 : -1; return b; - } +} - // v8float prefix increment / decrement operators +// v8float prefix increment / decrement operators - inline v8float operator ++( v8float &a ) - { +inline v8float operator++( v8float& a ) +{ v8float b; b.f[0] = ++a.f[0]; @@ -1397,10 +1450,10 @@ namespace v8 b.f[7] = ++a.f[7]; return b; - } +} - inline v8float operator --( v8float &a ) - { +inline v8float operator--( v8float& a ) +{ v8float b; b.f[0] = --a.f[0]; @@ -1413,12 +1466,12 @@ namespace v8 b.f[7] = --a.f[7]; return b; - } +} - // v8float postfix increment / decrement operators +// v8float postfix increment / decrement operators - inline v8float operator ++( v8float &a, int ) - { +inline v8float operator++( v8float& a, int ) +{ v8float b; b.f[0] = a.f[0]++; @@ -1431,10 +1484,10 @@ namespace v8 b.f[7] = a.f[7]++; return b; - } +} - inline v8float operator --( v8float &a, int ) - { +inline v8float operator--( v8float& a, int ) +{ v8float b; b.f[0] = a.f[0]--; @@ -1447,145 +1500,155 @@ namespace v8 b.f[7] = a.f[7]--; return b; - } - - // v8float binary operators - -# define BINARY(op) \ - inline v8float operator op( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = a.f[0] op b.f[0]; \ - c.f[1] = a.f[1] op b.f[1]; \ - c.f[2] = a.f[2] op b.f[2]; \ - c.f[3] = a.f[3] op b.f[3]; \ - c.f[4] = a.f[4] op b.f[4]; \ - c.f[5] = a.f[5] op b.f[5]; \ - c.f[6] = a.f[6] op b.f[6]; \ - c.f[7] = a.f[7] op b.f[7]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v8float logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8float &a, const v8float &b ) \ - { \ - v8int c; \ - c.i[0] = -( a.f[0] op b.f[0] ); \ - c.i[1] = -( a.f[1] op b.f[1] ); \ - c.i[2] = -( a.f[2] op b.f[2] ); \ - c.i[3] = -( a.f[3] op b.f[3] ); \ - c.i[4] = -( a.f[4] op b.f[4] ); \ - c.i[5] = -( a.f[5] op b.f[5] ); \ - c.i[6] = -( a.f[6] op b.f[6] ); \ - c.i[7] = -( a.f[7] op b.f[7] 
); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8float math library functions - -# define CMATH_FR1(fn) \ - inline v8float fn( const v8float &a ) \ - { \ - v8float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - b.f[4] = ::fn( a.f[4] ); \ - b.f[5] = ::fn( a.f[5] ); \ - b.f[6] = ::fn( a.f[6] ); \ - b.f[7] = ::fn( a.f[7] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v8float fn( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - c.f[4] = ::fn( a.f[4], b.f[4] ); \ - c.f[5] = ::fn( a.f[5], b.f[5] ); \ - c.f[6] = ::fn( a.f[6], b.f[6] ); \ - c.f[7] = ::fn( a.f[7], b.f[7] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v8float copysign( const v8float &a, const v8float &b ) - { +} + +// v8float binary operators + +#define BINARY( op ) \ + inline v8float operator op( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v8float logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8float& a, const v8float& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.f[0] op 
b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8float math library functions + +#define CMATH_FR1( fn ) \ + inline v8float fn( const v8float& a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v8float fn( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + + inline v8float + copysign( const v8float& a, const v8float& b ) +{ v8float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) + t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) + t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) + t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] 
< 0 ) + t = -t; c.f[3] = t; t = ::fabs( a.f[4] ); - if( b.f[4] < 0 ) t = -t; + if ( b.f[4] < 0 ) + t = -t; c.f[4] = t; t = ::fabs( a.f[5] ); - if( b.f[5] < 0 ) t = -t; + if ( b.f[5] < 0 ) + t = -t; c.f[5] = t; t = ::fabs( a.f[6] ); - if( b.f[6] < 0 ) t = -t; + if ( b.f[6] < 0 ) + t = -t; c.f[6] = t; t = ::fabs( a.f[7] ); - if( b.f[7] < 0 ) t = -t; + if ( b.f[7] < 0 ) + t = -t; c.f[7] = t; return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v8float miscellaneous functions +// v8float miscellaneous functions - inline v8float rsqrt_approx( const v8float &a ) - { +inline v8float rsqrt_approx( const v8float& a ) +{ v8float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1598,10 +1661,10 @@ namespace v8 b.f[7] = ::sqrt( 1.0f / a.f[7] ); return b; - } +} - inline v8float rsqrt( const v8float &a ) - { +inline v8float rsqrt( const v8float& a ) +{ v8float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1614,10 +1677,10 @@ namespace v8 b.f[7] = ::sqrt( 1.0f / a.f[7] ); return b; - } +} - inline v8float rcp_approx( const v8float &a ) - { +inline v8float rcp_approx( const v8float& a ) +{ v8float b; b.f[0] = 1.0f / a.f[0]; @@ -1630,10 +1693,10 @@ namespace v8 b.f[7] = 1.0f / a.f[7]; return b; - } +} - inline v8float rcp( const v8float &a ) - { +inline v8float rcp( const v8float& a ) +{ v8float b; b.f[0] = 1.0f / a.f[0]; @@ -1646,10 +1709,10 @@ namespace v8 b.f[7] = 1.0f / a.f[7]; return b; - } +} - inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fma( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = a.f[0] * b.f[0] + c.f[0]; @@ -1662,10 +1725,10 @@ namespace v8 d.f[7] = a.f[7] * b.f[7] + c.f[7]; return d; - } +} - inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = a.f[0] * b.f[0] - c.f[0]; @@ -1678,10 +1741,10 @@ namespace v8 d.f[7] = a.f[7] * b.f[7] 
- c.f[7]; return d; - } +} - inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fnms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = c.f[0] - a.f[0] * b.f[0]; @@ -1694,10 +1757,10 @@ namespace v8 d.f[7] = c.f[7] - a.f[7] * b.f[7]; return d; - } +} - inline v8float clear_bits( const v8int &m, const v8float &a ) - { +inline v8float clear_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = ( ~m.i[0] ) & a.i[0]; @@ -1710,10 +1773,10 @@ namespace v8 b.i[7] = ( ~m.i[7] ) & a.i[7]; return b; - } +} - inline v8float set_bits( const v8int &m, const v8float &a ) - { +inline v8float set_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = m.i[0] | a.i[0]; @@ -1726,10 +1789,10 @@ namespace v8 b.i[7] = m.i[7] | a.i[7]; return b; - } +} - inline v8float toggle_bits( const v8int &m, const v8float &a ) - { +inline v8float toggle_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = m.i[0] ^ a.i[0]; @@ -1742,10 +1805,10 @@ namespace v8 b.i[7] = m.i[7] ^ a.i[7]; return b; - } +} - inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void increment_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] += a.f[0]; p[1] += a.f[1]; p[2] += a.f[2]; @@ -1754,10 +1817,10 @@ namespace v8 p[5] += a.f[5]; p[6] += a.f[6]; p[7] += a.f[7]; - } +} - inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void decrement_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] -= a.f[0]; p[1] -= a.f[1]; p[2] -= a.f[2]; @@ -1766,10 +1829,10 @@ namespace v8 p[5] -= a.f[5]; p[6] -= a.f[6]; p[7] -= a.f[7]; - } +} - inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void scale_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] *= a.f[0]; p[1] *= a.f[1]; p[2] *= a.f[2]; @@ -1778,7 +1841,7 @@ namespace v8 p[5] *= a.f[5]; p[6] *= a.f[6]; p[7] *= a.f[7]; - } +} } // namespace v8 diff --git 
a/src/util/v8/v8_portable_v0.h b/src/util/v8/v8_portable_v0.h index b8d6b0c8..e0f18ab4 100644 --- a/src/util/v8/v8_portable_v0.h +++ b/src/util/v8/v8_portable_v0.h @@ -11,192 +11,171 @@ #define V8_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v8 { - class v8; - class v8int; - class v8float; +class v8; +class v8int; +class v8float; - //////////////// - // v8 base class +//////////////// +// v8 base class - class v8 - { +class v8 +{ friend class v8int; friend class v8float; // v8 miscellaneous friends - friend inline int any( const v8 &a ) ALWAYS_INLINE; - friend inline int all( const v8 &a ) ALWAYS_INLINE; + friend inline int any( const v8& a ) ALWAYS_INLINE; + friend inline int all( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 splat( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 shuffle( const v8& a ) ALWAYS_INLINE; - friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; - friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + friend inline void swap( v8& a, v8& b ) ALWAYS_INLINE; + friend inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, + v8& a5, v8& a6, v8& a7 ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& a, + const v8& b ) ALWAYS_INLINE; // v8 memory 
manipulation friends - friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; - friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void load_8x1( const void* ALIGNED( 16 ) p, + v8& a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v8 transposed memory manipulation friends // Note: Half aligned values are permissible in the 8x2_tr variants. 
- friend inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) ALWAYS_INLINE; - - friend inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) ALWAYS_INLINE; - - friend inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; - - friend inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; - - friend inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; - - friend inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; - - friend inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3, - void * ALIGNED(8) a4, - void * ALIGNED(8) a5, - void * ALIGNED(8) a6, - void * ALIGNED(8) a7 ) ALWAYS_INLINE; - - friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 
&c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x4_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x8_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - const v8 &e, const v8 &f, - const v8 &g, const v8 &h, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; + friend inline void load_8x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + const void* a4, const void* a5, + const void* a6, const void* a7, + v8& a ) ALWAYS_INLINE; + + friend inline void + load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, + v8& a, v8& b ) ALWAYS_INLINE; + + friend inline void + load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c ) ALWAYS_INLINE; + + friend inline void + load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + 
v8& a, v8& b, v8& c, v8& d ) ALWAYS_INLINE; + + friend inline void + load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, + v8& h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, + void* a3, void* a4, void* a5, void* a6, + void* a7 ) ALWAYS_INLINE; + + friend inline void + store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) ALWAYS_INLINE; + + friend inline void + store_8x3_tr( const v8& a, const v8& b, const v8& c, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, + void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( + const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( + const v8& a, const v8& b, const v8& c, const v8& d, const v8& e, + const v8& f, const v8& g, const v8& h, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; protected: - - union - { - int i[8]; - float f[8]; + union { + int i[8]; + float f[8]; }; public: + v8() {} // Default constructor - v8() {} // Default constructor - - v8( const v8 &a ) // Copy constructor + v8( 
const v8& a ) // Copy constructor { - i[0]=a.i[0]; i[1]=a.i[1]; i[2]=a.i[2]; i[3]=a.i[3]; - i[4]=a.i[4]; i[5]=a.i[5]; i[6]=a.i[6]; i[7]=a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - ~v8() {} // Default destructor - }; + ~v8() {} // Default destructor +}; - // v8 miscellaneous functions +// v8 miscellaneous functions - inline int any( const v8 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3] || - a.i[4] || a.i[5] || a.i[6] || a.i[7]; - } +inline int any( const v8& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7]; +} - inline int all( const v8 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3] && - a.i[4] && a.i[5] && a.i[6] && a.i[7]; - } +inline int all( const v8& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7]; +} - template - inline v8 splat( const v8 & a ) - { +template +inline v8 splat( const v8& a ) +{ v8 b; b.i[0] = a.i[n]; @@ -209,11 +188,11 @@ namespace v8 b.i[7] = a.i[n]; return b; - } +} - template - inline v8 shuffle( const v8 & a ) - { +template +inline v8 shuffle( const v8& a ) +{ v8 b; b.i[0] = a.i[i0]; @@ -226,12 +205,12 @@ namespace v8 b.i[7] = a.i[i7]; return b; - } +} -# define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v8 &a, v8 &b ) - { +inline void swap( v8& a, v8& b ) +{ sw( a.i[0], b.i[0] ); sw( a.i[1], b.i[1] ); sw( a.i[2], b.i[2] ); @@ -240,867 +219,912 @@ namespace v8 sw( a.i[5], b.i[5] ); sw( a.i[6], b.i[6] ); sw( a.i[7], b.i[7] ); - } - - inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( 
a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); - sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); - sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); - sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); - sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); - sw( a6.i[7],a7.i[6] ); - } - -# undef sw - - // v8 memory manipulation functions - - inline void load_8x1( const void * ALIGNED(16) p, - v8 &a ) - { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; - a.i[4] = ((const int * ALIGNED(16))p)[4]; - a.i[5] = ((const int * ALIGNED(16))p)[5]; - a.i[6] = ((const int * ALIGNED(16))p)[6]; - a.i[7] = ((const int * ALIGNED(16))p)[7]; - } - - inline void store_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void stream_8x1( const v8 &a, - void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; - ((int * ALIGNED(16))p)[4] = a.i[4]; - ((int * ALIGNED(16))p)[5] = a.i[5]; - ((int * ALIGNED(16))p)[6] = a.i[6]; - ((int * ALIGNED(16))p)[7] = a.i[7]; - } - - inline void clear_8x1( void * ALIGNED(16) p ) - { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; - ((int * ALIGNED(16))p)[4] = 0; - ((int * ALIGNED(16))p)[5] = 0; - ((int * ALIGNED(16))p)[6] = 0; - ((int * ALIGNED(16))p)[7] = 0; - } - - // FIXME: Ordering semantics - inline 
void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; - ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; - ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; - ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; - ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; - } - - inline void swap_8x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +} + +inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, v8& a5, v8& a6, + v8& a7 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a0.i[4], a4.i[0] ); + sw( a0.i[5], a5.i[0] ); + sw( a0.i[6], a6.i[0] ); + sw( a0.i[7], a7.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a1.i[4], a4.i[1] ); + sw( a1.i[5], a5.i[1] ); + sw( a1.i[6], a6.i[1] ); + sw( a1.i[7], a7.i[1] ); + sw( a2.i[3], a3.i[2] ); + sw( a2.i[4], a4.i[2] ); + sw( a2.i[5], a5.i[2] ); + sw( a2.i[6], a6.i[2] ); + sw( a2.i[7], a7.i[2] ); + sw( a3.i[4], a4.i[3] ); + sw( a3.i[5], a5.i[3] ); + sw( a3.i[6], a6.i[3] ); + sw( a3.i[7], a7.i[3] ); + sw( a4.i[5], a5.i[4] ); + sw( a4.i[6], a6.i[4] ); + sw( a4.i[7], a7.i[4] ); + sw( a5.i[6], a6.i[5] ); + sw( a5.i[7], a7.i[5] ); + sw( a6.i[7], a7.i[6] ); +} + +#undef sw + +// v8 memory manipulation functions + +inline void load_8x1( const void* ALIGNED( 16 ) p, v8& a ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))p )[0]; + a.i[1] = ( (const int* ALIGNED( 16 ))p )[1]; + a.i[2] = ( (const int* ALIGNED( 16 ))p )[2]; + a.i[3] = ( (const int* ALIGNED( 16 ))p )[3]; + a.i[4] = ( (const int* ALIGNED( 16 ))p )[4]; + a.i[5] = ( (const int* ALIGNED( 16 ))p )[5]; + a.i[6] = ( (const int* ALIGNED( 16 ))p )[6]; + a.i[7] = ( (const int* 
ALIGNED( 16 ))p )[7]; +} + +inline void store_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void stream_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))p )[1] = a.i[1]; + ( (int* ALIGNED( 16 ))p )[2] = a.i[2]; + ( (int* ALIGNED( 16 ))p )[3] = a.i[3]; + ( (int* ALIGNED( 16 ))p )[4] = a.i[4]; + ( (int* ALIGNED( 16 ))p )[5] = a.i[5]; + ( (int* ALIGNED( 16 ))p )[6] = a.i[6]; + ( (int* ALIGNED( 16 ))p )[7] = a.i[7]; +} + +inline void clear_8x1( void* ALIGNED( 16 ) p ) +{ + ( (int* ALIGNED( 16 ))p )[0] = 0; + ( (int* ALIGNED( 16 ))p )[1] = 0; + ( (int* ALIGNED( 16 ))p )[2] = 0; + ( (int* ALIGNED( 16 ))p )[3] = 0; + ( (int* ALIGNED( 16 ))p )[4] = 0; + ( (int* ALIGNED( 16 ))p )[5] = 0; + ( (int* ALIGNED( 16 ))p )[6] = 0; + ( (int* ALIGNED( 16 ))p )[7] = 0; +} + +// FIXME: Ordering semantics +inline void copy_8x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ + ( (int* ALIGNED( 16 ))dst )[0] = ( (const int* ALIGNED( 16 ))src )[0]; + ( (int* ALIGNED( 16 ))dst )[1] = ( (const int* ALIGNED( 16 ))src )[1]; + ( (int* ALIGNED( 16 ))dst )[2] = ( (const int* ALIGNED( 16 ))src )[2]; + ( (int* ALIGNED( 16 ))dst )[3] = ( (const int* ALIGNED( 16 ))src )[3]; + ( (int* ALIGNED( 16 ))dst )[4] = ( (const int* ALIGNED( 16 ))src )[4]; + ( (int* ALIGNED( 16 ))dst )[5] = ( (const int* ALIGNED( 16 ))src )[5]; + ( (int* ALIGNED( 16 ))dst )[6] = ( (const int* ALIGNED( 16 ))src )[6]; + ( (int* ALIGNED( 16 ))dst )[7] = ( (const int* ALIGNED( 16 ))src )[7]; +} + +inline void swap_8x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * 
ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; - - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; - - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; - - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; - - t = ((int * ALIGNED(16))a)[4]; - ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; - ((int * ALIGNED(16))b)[4] = t; - - t = ((int * ALIGNED(16))a)[5]; - ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; - ((int * ALIGNED(16))b)[5] = t; - - t = ((int * ALIGNED(16))a)[6]; - ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; - ((int * ALIGNED(16))b)[6] = t; - - t = ((int * ALIGNED(16))a)[7]; - ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; - ((int * ALIGNED(16))b)[7] = t; - } - - // v8 transposed memory manipulation functions - - inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) - { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; - a.i[4] = ((const int *)a4)[0]; - a.i[5] = ((const int *)a5)[0]; - a.i[6] = ((const int *)a6)[0]; - a.i[7] = ((const int *)a7)[0]; - } - - inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) - { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; - - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; - - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * 
ALIGNED(8))a2)[1]; - - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; - - a.i[4] = ((const int * ALIGNED(8))a4)[0]; - b.i[4] = ((const int * ALIGNED(8))a4)[1]; - - a.i[5] = ((const int * ALIGNED(8))a5)[0]; - b.i[5] = ((const int * ALIGNED(8))a5)[1]; - - a.i[6] = ((const int * ALIGNED(8))a6)[0]; - b.i[6] = ((const int * ALIGNED(8))a6)[1]; - - a.i[7] = ((const int * ALIGNED(8))a7)[0]; - b.i[7] = ((const int * ALIGNED(8))a7)[1]; - } - - inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - } - - inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - 
const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - d.i[4] = ((const int * ALIGNED(16))a4)[3]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - } - - inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) - { - a.i[0] = 
((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - e.i[0] = ((const int * ALIGNED(16))a0)[4]; - f.i[0] = ((const int * ALIGNED(16))a0)[5]; - g.i[0] = ((const int * ALIGNED(16))a0)[6]; - h.i[0] = ((const int * ALIGNED(16))a0)[7]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - e.i[1] = ((const int * ALIGNED(16))a1)[4]; - f.i[1] = ((const int * ALIGNED(16))a1)[5]; - g.i[1] = ((const int * ALIGNED(16))a1)[6]; - h.i[1] = ((const int * ALIGNED(16))a1)[7]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - e.i[2] = ((const int * ALIGNED(16))a2)[4]; - f.i[2] = ((const int * ALIGNED(16))a2)[5]; - g.i[2] = ((const int * ALIGNED(16))a2)[6]; - h.i[2] = ((const int * ALIGNED(16))a2)[7]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - e.i[3] = ((const int * ALIGNED(16))a3)[4]; - f.i[3] = ((const int * ALIGNED(16))a3)[5]; - g.i[3] = ((const int * ALIGNED(16))a3)[6]; - h.i[3] = ((const int * ALIGNED(16))a3)[7]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - d.i[4] = ((const int * ALIGNED(16))a4)[3]; - e.i[4] = ((const int * ALIGNED(16))a4)[4]; - f.i[4] = ((const int * ALIGNED(16))a4)[5]; - g.i[4] = ((const int * ALIGNED(16))a4)[6]; - h.i[4] = ((const int * ALIGNED(16))a4)[7]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - e.i[5] = ((const 
int * ALIGNED(16))a5)[4]; - f.i[5] = ((const int * ALIGNED(16))a5)[5]; - g.i[5] = ((const int * ALIGNED(16))a5)[6]; - h.i[5] = ((const int * ALIGNED(16))a5)[7]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - e.i[6] = ((const int * ALIGNED(16))a6)[4]; - f.i[6] = ((const int * ALIGNED(16))a6)[5]; - g.i[6] = ((const int * ALIGNED(16))a6)[6]; - h.i[6] = ((const int * ALIGNED(16))a6)[7]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - e.i[7] = ((const int * ALIGNED(16))a7)[4]; - f.i[7] = ((const int * ALIGNED(16))a7)[5]; - g.i[7] = ((const int * ALIGNED(16))a7)[6]; - h.i[7] = ((const int * ALIGNED(16))a7)[7]; - } - - inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) - { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; - ((int *)a4)[0] = a.i[4]; - ((int *)a5)[0] = a.i[5]; - ((int *)a6)[0] = a.i[6]; - ((int *)a7)[0] = a.i[7]; - } - - inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3, - void * ALIGNED(8) a4, void * ALIGNED(8) a5, - void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) - { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; - - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; - - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; - - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; - - ((int * ALIGNED(8))a4)[0] = a.i[4]; - ((int * ALIGNED(8))a4)[1] = b.i[4]; - - ((int * ALIGNED(8))a5)[0] = a.i[5]; - ((int * ALIGNED(8))a5)[1] = b.i[5]; - - ((int * ALIGNED(8))a6)[0] = a.i[6]; - ((int 
* ALIGNED(8))a6)[1] = b.i[6]; - - ((int * ALIGNED(8))a7)[0] = a.i[7]; - ((int * ALIGNED(8))a7)[1] = b.i[7]; - } - - inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - } - - inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = 
b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; - } - - inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - const v8 &e, const v8 &f, const v8 &g, const v8 &h, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - ((int * ALIGNED(16))a0)[4] = e.i[0]; - ((int * ALIGNED(16))a0)[5] = f.i[0]; - ((int * ALIGNED(16))a0)[6] = g.i[0]; - ((int * ALIGNED(16))a0)[7] = h.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - ((int * ALIGNED(16))a1)[4] = e.i[1]; - ((int * ALIGNED(16))a1)[5] = f.i[1]; - ((int * ALIGNED(16))a1)[6] = g.i[1]; - ((int * ALIGNED(16))a1)[7] = h.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - 
((int * ALIGNED(16))a2)[4] = e.i[2]; - ((int * ALIGNED(16))a2)[5] = f.i[2]; - ((int * ALIGNED(16))a2)[6] = g.i[2]; - ((int * ALIGNED(16))a2)[7] = h.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - ((int * ALIGNED(16))a3)[4] = e.i[3]; - ((int * ALIGNED(16))a3)[5] = f.i[3]; - ((int * ALIGNED(16))a3)[6] = g.i[3]; - ((int * ALIGNED(16))a3)[7] = h.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - ((int * ALIGNED(16))a4)[4] = e.i[4]; - ((int * ALIGNED(16))a4)[5] = f.i[4]; - ((int * ALIGNED(16))a4)[6] = g.i[4]; - ((int * ALIGNED(16))a4)[7] = h.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - ((int * ALIGNED(16))a5)[4] = e.i[5]; - ((int * ALIGNED(16))a5)[5] = f.i[5]; - ((int * ALIGNED(16))a5)[6] = g.i[5]; - ((int * ALIGNED(16))a5)[7] = h.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - ((int * ALIGNED(16))a6)[4] = e.i[6]; - ((int * ALIGNED(16))a6)[5] = f.i[6]; - ((int * ALIGNED(16))a6)[6] = g.i[6]; - ((int * ALIGNED(16))a6)[7] = h.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; - ((int * ALIGNED(16))a7)[4] = e.i[7]; - ((int * ALIGNED(16))a7)[5] = f.i[7]; - ((int * ALIGNED(16))a7)[6] = g.i[7]; - ((int * ALIGNED(16))a7)[7] = h.i[7]; - } - - ////////////// - // v8int class - - class v8int : public v8 - { + t = ( (int* ALIGNED( 16 ))a )[0]; + ( (int* ALIGNED( 16 ))a )[0] = ( (int* ALIGNED( 16 ))b )[0]; + ( (int* ALIGNED( 16 ))b )[0] = t; + + t = ( (int* ALIGNED( 16 ))a )[1]; + ( (int* ALIGNED( 16 ))a )[1] 
= ( (int* ALIGNED( 16 ))b )[1]; + ( (int* ALIGNED( 16 ))b )[1] = t; + + t = ( (int* ALIGNED( 16 ))a )[2]; + ( (int* ALIGNED( 16 ))a )[2] = ( (int* ALIGNED( 16 ))b )[2]; + ( (int* ALIGNED( 16 ))b )[2] = t; + + t = ( (int* ALIGNED( 16 ))a )[3]; + ( (int* ALIGNED( 16 ))a )[3] = ( (int* ALIGNED( 16 ))b )[3]; + ( (int* ALIGNED( 16 ))b )[3] = t; + + t = ( (int* ALIGNED( 16 ))a )[4]; + ( (int* ALIGNED( 16 ))a )[4] = ( (int* ALIGNED( 16 ))b )[4]; + ( (int* ALIGNED( 16 ))b )[4] = t; + + t = ( (int* ALIGNED( 16 ))a )[5]; + ( (int* ALIGNED( 16 ))a )[5] = ( (int* ALIGNED( 16 ))b )[5]; + ( (int* ALIGNED( 16 ))b )[5] = t; + + t = ( (int* ALIGNED( 16 ))a )[6]; + ( (int* ALIGNED( 16 ))a )[6] = ( (int* ALIGNED( 16 ))b )[6]; + ( (int* ALIGNED( 16 ))b )[6] = t; + + t = ( (int* ALIGNED( 16 ))a )[7]; + ( (int* ALIGNED( 16 ))a )[7] = ( (int* ALIGNED( 16 ))b )[7]; + ( (int* ALIGNED( 16 ))b )[7] = t; +} + +// v8 transposed memory manipulation functions + +inline void load_8x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, const void* a4, const void* a5, + const void* a6, const void* a7, v8& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; + a.i[4] = ( (const int*)a4 )[0]; + a.i[5] = ( (const int*)a5 )[0]; + a.i[6] = ( (const int*)a6 )[0]; + a.i[7] = ( (const int*)a7 )[0]; +} + +inline void +load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, v8& a, + v8& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 
))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a4 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a5 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a6 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a7 )[1]; +} + +inline void +load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; +} + +inline void +load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 
) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; +} + +inline void +load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const 
void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, v8& h ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + e.i[0] = ( (const int* ALIGNED( 16 ))a0 )[4]; + f.i[0] = ( (const int* ALIGNED( 16 ))a0 )[5]; + g.i[0] = ( (const int* ALIGNED( 16 ))a0 )[6]; + h.i[0] = ( (const int* ALIGNED( 16 ))a0 )[7]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + e.i[1] = ( (const int* ALIGNED( 16 ))a1 )[4]; + f.i[1] = ( (const int* ALIGNED( 16 ))a1 )[5]; + g.i[1] = ( (const int* ALIGNED( 16 ))a1 )[6]; + h.i[1] = ( (const int* ALIGNED( 16 ))a1 )[7]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + e.i[2] = ( (const int* ALIGNED( 16 ))a2 )[4]; + f.i[2] = ( (const int* ALIGNED( 16 ))a2 )[5]; + g.i[2] = ( (const int* ALIGNED( 16 ))a2 )[6]; + h.i[2] = ( (const int* ALIGNED( 16 ))a2 )[7]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + e.i[3] = ( (const int* ALIGNED( 16 ))a3 )[4]; + f.i[3] = ( (const int* ALIGNED( 16 ))a3 )[5]; + g.i[3] = ( (const int* ALIGNED( 16 ))a3 )[6]; + h.i[3] = ( (const int* ALIGNED( 16 ))a3 )[7]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + e.i[4] = ( (const int* ALIGNED( 16 ))a4 )[4]; + f.i[4] = ( (const int* ALIGNED( 16 ))a4 )[5]; + g.i[4] = ( (const int* ALIGNED( 16 ))a4 )[6]; + 
h.i[4] = ( (const int* ALIGNED( 16 ))a4 )[7]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + e.i[5] = ( (const int* ALIGNED( 16 ))a5 )[4]; + f.i[5] = ( (const int* ALIGNED( 16 ))a5 )[5]; + g.i[5] = ( (const int* ALIGNED( 16 ))a5 )[6]; + h.i[5] = ( (const int* ALIGNED( 16 ))a5 )[7]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + e.i[6] = ( (const int* ALIGNED( 16 ))a6 )[4]; + f.i[6] = ( (const int* ALIGNED( 16 ))a6 )[5]; + g.i[6] = ( (const int* ALIGNED( 16 ))a6 )[6]; + h.i[6] = ( (const int* ALIGNED( 16 ))a6 )[7]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; + e.i[7] = ( (const int* ALIGNED( 16 ))a7 )[4]; + f.i[7] = ( (const int* ALIGNED( 16 ))a7 )[5]; + g.i[7] = ( (const int* ALIGNED( 16 ))a7 )[6]; + h.i[7] = ( (const int* ALIGNED( 16 ))a7 )[7]; +} + +inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, void* a3, + void* a4, void* a5, void* a6, void* a7 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; + ( (int*)a4 )[0] = a.i[4]; + ( (int*)a5 )[0] = a.i[5]; + ( (int*)a6 )[0] = a.i[6]; + ( (int*)a7 )[0] = a.i[7]; +} + +inline void store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 
8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a4 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a5 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a6 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a7 )[1] = b.i[7]; +} + +inline void store_8x3_tr( const v8& a, const v8& b, const v8& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; +} + +inline void store_8x4_tr( const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* 
ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; +} + +inline void store_8x8_tr( const v8& a, const v8& b, const v8& c, const v8& d, + const v8& e, const v8& f, const v8& g, const v8& h, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + ( (int* ALIGNED( 16 ))a0 )[4] = e.i[0]; + ( (int* 
ALIGNED( 16 ))a0 )[5] = f.i[0]; + ( (int* ALIGNED( 16 ))a0 )[6] = g.i[0]; + ( (int* ALIGNED( 16 ))a0 )[7] = h.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + ( (int* ALIGNED( 16 ))a1 )[4] = e.i[1]; + ( (int* ALIGNED( 16 ))a1 )[5] = f.i[1]; + ( (int* ALIGNED( 16 ))a1 )[6] = g.i[1]; + ( (int* ALIGNED( 16 ))a1 )[7] = h.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + ( (int* ALIGNED( 16 ))a2 )[4] = e.i[2]; + ( (int* ALIGNED( 16 ))a2 )[5] = f.i[2]; + ( (int* ALIGNED( 16 ))a2 )[6] = g.i[2]; + ( (int* ALIGNED( 16 ))a2 )[7] = h.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + ( (int* ALIGNED( 16 ))a3 )[4] = e.i[3]; + ( (int* ALIGNED( 16 ))a3 )[5] = f.i[3]; + ( (int* ALIGNED( 16 ))a3 )[6] = g.i[3]; + ( (int* ALIGNED( 16 ))a3 )[7] = h.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + ( (int* ALIGNED( 16 ))a4 )[4] = e.i[4]; + ( (int* ALIGNED( 16 ))a4 )[5] = f.i[4]; + ( (int* ALIGNED( 16 ))a4 )[6] = g.i[4]; + ( (int* ALIGNED( 16 ))a4 )[7] = h.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + ( (int* ALIGNED( 16 ))a5 )[4] = e.i[5]; + ( (int* ALIGNED( 16 ))a5 )[5] = f.i[5]; + ( (int* ALIGNED( 16 ))a5 )[6] = g.i[5]; + ( (int* ALIGNED( 16 ))a5 )[7] = h.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + ( (int* ALIGNED( 16 
))a6 )[4] = e.i[6]; + ( (int* ALIGNED( 16 ))a6 )[5] = f.i[6]; + ( (int* ALIGNED( 16 ))a6 )[6] = g.i[6]; + ( (int* ALIGNED( 16 ))a6 )[7] = h.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; + ( (int* ALIGNED( 16 ))a7 )[4] = e.i[7]; + ( (int* ALIGNED( 16 ))a7 )[5] = f.i[7]; + ( (int* ALIGNED( 16 ))a7 )[6] = g.i[7]; + ( (int* ALIGNED( 16 ))a7 )[7] = h.i[7]; +} + +////////////// +// v8int class + +class v8int : public v8 +{ // v8int prefix unary operator friends - friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator~( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8int prefix increment / decrement operator friends - friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a ) ALWAYS_INLINE; // v8int postfix increment / decrement operator friends - friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a, int ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a, int ) ALWAYS_INLINE; // v8int binary operator friends - friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int &a, const v8int &b ) 
ALWAYS_INLINE; - friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator*(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator/( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator%( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator^( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator|( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int logical operator friends - friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8int &a, const 
v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8int abs( const v8int& a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& t, + const v8& f ) ALWAYS_INLINE; // v8float unary operator friends - friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float miscellaneous friends - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float 
toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; public: - // v8int constructors / destructors - v8int() {} // Default constructor + v8int() {} // Default constructor - v8int( const v8int &a ) // Copy constructor + v8int( const v8int& a ) // Copy constructor { - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; - i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - v8int( const v8 &a ) // Init from mixed + v8int( const v8& a ) // Init from mixed { - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; - i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + i[4] = a.i[4]; + i[5] = a.i[5]; + i[6] = a.i[6]; + i[7] = a.i[7]; } - v8int( int a ) // Init from scalar + v8int( int a ) // Init from scalar { - i[0] = a; i[1] = a; i[2] = a; i[3] = a; - i[4] = a; i[5] = a; i[6] = a; i[7] = a; + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; + i[4] = a; + i[5] = a; + i[6] = a; + i[7] = a; } - v8int( int i0, int i1, int i2, int i3, - int i4, int i5, int i6, int i7 ) // Init from scalars + v8int( int i0, int i1, int i2, int i3, int i4, int i5, int i6, + int i7 ) // Init from scalars { - i[0] = i0; i[1] = i1; i[2] = i2; i[3] = i3; - i[4] = i4; i[5] = i5; i[6] = i6; i[7] = i7; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + i[4] = i4; + i[5] = i5; + i[6] = i6; + i[7] = i7; } - ~v8int() {} // Destructor + ~v8int() {} // Destructor // v8int assignment operators -# define ASSIGN(op) \ - inline v8int &operator op( const v8int &b ) \ - { \ - i[0] op b.i[0]; \ - i[1] op 
b.i[1]; \ - i[2] op b.i[2]; \ - i[3] op b.i[3]; \ - i[4] op b.i[4]; \ - i[5] op b.i[5]; \ - i[6] op b.i[6]; \ - i[7] op b.i[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8int& operator op( const v8int& b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v8int member access operator - inline int &operator []( int n ) - { - return i[n]; + inline int& operator[]( int n ) { return i[n]; } + + inline int operator()( int n ) { return i[n]; } +}; + +// v8int prefix unary operators + +#define PREFIX_UNARY( op ) \ + inline v8int operator op( const v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ } - inline int operator ()( int n ) - { - return i[n]; - } - }; - - // v8int prefix unary operators - -# define PREFIX_UNARY(op) \ - inline v8int operator op( const v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - - PREFIX_UNARY(+) - PREFIX_UNARY(-) - - inline v8int operator !( const v8int & a ) - { +PREFIX_UNARY( +) +PREFIX_UNARY( -) + +inline v8int operator!( const v8int& a ) +{ v8int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( 
!a.i[2] ); - b.i[3] = - ( !a.i[3] ); - b.i[4] = - ( !a.i[4] ); - b.i[5] = - ( !a.i[5] ); - b.i[6] = - ( !a.i[6] ); - b.i[7] = - ( !a.i[7] ); + b.i[0] = -( !a.i[0] ); + b.i[1] = -( !a.i[1] ); + b.i[2] = -( !a.i[2] ); + b.i[3] = -( !a.i[3] ); + b.i[4] = -( !a.i[4] ); + b.i[5] = -( !a.i[5] ); + b.i[6] = -( !a.i[6] ); + b.i[7] = -( !a.i[7] ); return b; - } - - PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - // v8int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v8int operator op( v8int & a ) \ - { \ - v8int b; \ - b.i[0] = ( op a.i[0] ); \ - b.i[1] = ( op a.i[1] ); \ - b.i[2] = ( op a.i[2] ); \ - b.i[3] = ( op a.i[3] ); \ - b.i[4] = ( op a.i[4] ); \ - b.i[5] = ( op a.i[5] ); \ - b.i[6] = ( op a.i[6] ); \ - b.i[7] = ( op a.i[7] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v8int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v8int operator op( v8int & a, int ) \ - { \ - v8int b; \ - b.i[0] = ( a.i[0] op ); \ - b.i[1] = ( a.i[1] op ); \ - b.i[2] = ( a.i[2] op ); \ - b.i[3] = ( a.i[3] op ); \ - b.i[4] = ( a.i[4] op ); \ - b.i[5] = ( a.i[5] op ); \ - b.i[6] = ( a.i[6] op ); \ - b.i[7] = ( a.i[7] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v8int binary operators - -# define BINARY(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = a.i[0] op b.i[0]; \ - c.i[1] = a.i[1] op b.i[1]; \ - c.i[2] = a.i[2] op b.i[2]; \ - c.i[3] = a.i[3] op b.i[3]; \ - c.i[4] = a.i[4] op b.i[4]; \ - c.i[5] = a.i[5] op b.i[5]; \ - c.i[6] = a.i[6] op b.i[6]; \ - c.i[7] = a.i[7] op b.i[7]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v8int logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - c.i[0] = - ( 
a.i[0] op b.i[0] ); \ - c.i[1] = - ( a.i[1] op b.i[1] ); \ - c.i[2] = - ( a.i[2] op b.i[2] ); \ - c.i[3] = - ( a.i[3] op b.i[3] ); \ - c.i[4] = - ( a.i[4] op b.i[4] ); \ - c.i[5] = - ( a.i[5] op b.i[5] ); \ - c.i[6] = - ( a.i[6] op b.i[6] ); \ - c.i[7] = - ( a.i[7] op b.i[7] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8int miscellaneous functions - - inline v8int abs( const v8int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v8int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v8int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v8int binary operators + +#define BINARY( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef 
BINARY + +// v8int logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.i[0] op b.i[0] ); \ + c.i[1] = -( a.i[1] op b.i[1] ); \ + c.i[2] = -( a.i[2] op b.i[2] ); \ + c.i[3] = -( a.i[3] op b.i[3] ); \ + c.i[4] = -( a.i[4] op b.i[4] ); \ + c.i[5] = -( a.i[5] op b.i[5] ); \ + c.i[6] = -( a.i[6] op b.i[6] ); \ + c.i[7] = -( a.i[7] op b.i[7] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8int miscellaneous functions + +inline v8int abs( const v8int& a ) +{ v8int b; b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; @@ -1113,10 +1137,10 @@ namespace v8 b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; return b; - } +} - inline v8 czero( const v8int &c, const v8 &a ) - { +inline v8 czero( const v8int& c, const v8& a ) +{ v8 b; b.i[0] = a.i[0] & ~c.i[0]; @@ -1129,10 +1153,10 @@ namespace v8 b.i[7] = a.i[7] & ~c.i[7]; return b; - } +} - inline v8 notczero( const v8int &c, const v8 &a ) - { +inline v8 notczero( const v8int& c, const v8& a ) +{ v8 b; b.i[0] = a.i[0] & c.i[0]; @@ -1145,10 +1169,10 @@ namespace v8 b.i[7] = a.i[7] & c.i[7]; return b; - } +} - inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) - { +inline v8 merge( const v8int& c, const v8& t, const v8& f ) +{ v8 m; m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); @@ -1161,180 +1185,209 @@ namespace v8 m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); return m; - } +} - //////////////// - // v8float class +//////////////// +// v8float class - class v8float : public v8 - { +class v8float : public v8 +{ // v8float prefix unary operator friends - friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8float &a ) 
ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator~( const v8float& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8float prefix increment / decrement operator friends - friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a ) ALWAYS_INLINE; // v8float postfix increment / decrement operator friends - friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a, int ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a, int ) ALWAYS_INLINE; // v8float binary operator friends - friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator*(const v8float& a, + const v8float& b)ALWAYS_INLINE; + friend inline v8float operator/( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( 
const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float math library friends -# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ - const v8float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v8float fn( const v8float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v8float fn( const v8float& a, const v8float& b ) ALWAYS_INLINE 
+ + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v8float miscellaneous friends - friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rsqrt( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp( const v8float& a ) ALWAYS_INLINE; + friend inline v8float fma( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fms( 
const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline void increment_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void scale_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; public: - // v8float constructors / destructors - v8float() {} // Default constructor + v8float() {} // Default constructor - v8float( const v8float &a ) // Copy constructor + v8float( const v8float& a ) // Copy constructor { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; - f[4] = a.f[4]; - f[5] = a.f[5]; - f[6] = a.f[6]; - f[7] = a.f[7]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; } - v8float( const v8 &a ) // Init from mixed + v8float( const v8& a ) // Init from mixed { - f[0] = a.f[0]; - f[1] = a.f[1]; - f[2] = a.f[2]; - f[3] = a.f[3]; - f[4] = a.f[4]; - f[5] = a.f[5]; - f[6] = a.f[6]; - f[7] = a.f[7]; + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; } - v8float( float a ) // Init from scalar + v8float( float a ) // Init from scalar { - f[0] = a; - f[1] = a; - f[2] = a; - f[3] = a; - f[4] = a; - f[5] = a; - f[6] = a; - f[7] = a; + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; } - v8float( float f0, float f1, float f2, float f3, - float f4, float f5, float f6, float f7 ) // Init from scalars + v8float( float f0, 
float f1, float f2, float f3, float f4, float f5, + float f6, float f7 ) // Init from scalars { - f[0] = f0; - f[1] = f1; - f[2] = f2; - f[3] = f3; - f[4] = f4; - f[5] = f5; - f[6] = f6; - f[7] = f7; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + f[4] = f4; + f[5] = f5; + f[6] = f6; + f[7] = f7; } - ~v8float() {} // Destructor + ~v8float() {} // Destructor // v8float assignment operators -# define ASSIGN(op) \ - inline v8float &operator op( const v8float &b ) \ - { \ - f[0] op b.f[0]; \ - f[1] op b.f[1]; \ - f[2] op b.f[2]; \ - f[3] op b.f[3]; \ - f[4] op b.f[4]; \ - f[5] op b.f[5]; \ - f[6] op b.f[6]; \ - f[7] op b.f[7]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8float& operator op( const v8float& b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v8float member access operator - inline float &operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v8float prefix unary operators +// v8float prefix unary operators - inline v8float operator +( const v8float &a ) - { +inline v8float operator+( const v8float& a ) +{ v8float b; b.f[0] = +a.f[0]; @@ -1347,10 +1400,10 @@ namespace v8 b.f[7] = +a.f[7]; return b; - } +} - inline v8float operator -( const v8float &a ) - { +inline v8float operator-( const v8float& a ) +{ v8float b; b.f[0] = -a.f[0]; @@ -1363,10 +1416,10 @@ namespace v8 b.f[7] = -a.f[7]; return b; - } +} - inline v8int operator !( const v8float &a ) - { +inline v8int operator!( const v8float& a ) +{ v8int b; b.i[0] = a.i[0] ? 
0 : -1; @@ -1379,12 +1432,12 @@ namespace v8 b.i[7] = a.i[7] ? 0 : -1; return b; - } +} - // v8float prefix increment / decrement operators +// v8float prefix increment / decrement operators - inline v8float operator ++( v8float &a ) - { +inline v8float operator++( v8float& a ) +{ v8float b; b.f[0] = ++a.f[0]; @@ -1397,10 +1450,10 @@ namespace v8 b.f[7] = ++a.f[7]; return b; - } +} - inline v8float operator --( v8float &a ) - { +inline v8float operator--( v8float& a ) +{ v8float b; b.f[0] = --a.f[0]; @@ -1413,12 +1466,12 @@ namespace v8 b.f[7] = --a.f[7]; return b; - } +} - // v8float postfix increment / decrement operators +// v8float postfix increment / decrement operators - inline v8float operator ++( v8float &a, int ) - { +inline v8float operator++( v8float& a, int ) +{ v8float b; b.f[0] = a.f[0]++; @@ -1431,10 +1484,10 @@ namespace v8 b.f[7] = a.f[7]++; return b; - } +} - inline v8float operator --( v8float &a, int ) - { +inline v8float operator--( v8float& a, int ) +{ v8float b; b.f[0] = a.f[0]--; @@ -1447,145 +1500,155 @@ namespace v8 b.f[7] = a.f[7]--; return b; - } - - // v8float binary operators - -# define BINARY(op) \ - inline v8float operator op( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = a.f[0] op b.f[0]; \ - c.f[1] = a.f[1] op b.f[1]; \ - c.f[2] = a.f[2] op b.f[2]; \ - c.f[3] = a.f[3] op b.f[3]; \ - c.f[4] = a.f[4] op b.f[4]; \ - c.f[5] = a.f[5] op b.f[5]; \ - c.f[6] = a.f[6] op b.f[6]; \ - c.f[7] = a.f[7] op b.f[7]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v8float logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8float &a, const v8float &b ) \ - { \ - v8int c; \ - c.i[0] = -( a.f[0] op b.f[0] ); \ - c.i[1] = -( a.f[1] op b.f[1] ); \ - c.i[2] = -( a.f[2] op b.f[2] ); \ - c.i[3] = -( a.f[3] op b.f[3] ); \ - c.i[4] = -( a.f[4] op b.f[4] ); \ - c.i[5] = -( a.f[5] op b.f[5] ); \ - c.i[6] = -( a.f[6] op b.f[6] ); \ - c.i[7] = -( a.f[7] op b.f[7] 
); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8float math library functions - -# define CMATH_FR1(fn) \ - inline v8float fn( const v8float &a ) \ - { \ - v8float b; \ - b.f[0] = ::fn( a.f[0] ); \ - b.f[1] = ::fn( a.f[1] ); \ - b.f[2] = ::fn( a.f[2] ); \ - b.f[3] = ::fn( a.f[3] ); \ - b.f[4] = ::fn( a.f[4] ); \ - b.f[5] = ::fn( a.f[5] ); \ - b.f[6] = ::fn( a.f[6] ); \ - b.f[7] = ::fn( a.f[7] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v8float fn( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - c.f[0] = ::fn( a.f[0], b.f[0] ); \ - c.f[1] = ::fn( a.f[1], b.f[1] ); \ - c.f[2] = ::fn( a.f[2], b.f[2] ); \ - c.f[3] = ::fn( a.f[3], b.f[3] ); \ - c.f[4] = ::fn( a.f[4], b.f[4] ); \ - c.f[5] = ::fn( a.f[5], b.f[5] ); \ - c.f[6] = ::fn( a.f[6], b.f[6] ); \ - c.f[7] = ::fn( a.f[7], b.f[7] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v8float copysign( const v8float &a, const v8float &b ) - { +} + +// v8float binary operators + +#define BINARY( op ) \ + inline v8float operator op( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v8float logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8float& a, const v8float& b ) \ + { \ + v8int c; \ + c.i[0] = -( a.f[0] op 
b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8float math library functions + +#define CMATH_FR1( fn ) \ + inline v8float fn( const v8float& a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v8float fn( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + + inline v8float + copysign( const v8float& a, const v8float& b ) +{ v8float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) + t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) + t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) + t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] 
< 0 ) + t = -t; c.f[3] = t; t = ::fabs( a.f[4] ); - if( b.f[4] < 0 ) t = -t; + if ( b.f[4] < 0 ) + t = -t; c.f[4] = t; t = ::fabs( a.f[5] ); - if( b.f[5] < 0 ) t = -t; + if ( b.f[5] < 0 ) + t = -t; c.f[5] = t; t = ::fabs( a.f[6] ); - if( b.f[6] < 0 ) t = -t; + if ( b.f[6] < 0 ) + t = -t; c.f[6] = t; t = ::fabs( a.f[7] ); - if( b.f[7] < 0 ) t = -t; + if ( b.f[7] < 0 ) + t = -t; c.f[7] = t; return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v8float miscellaneous functions +// v8float miscellaneous functions - inline v8float rsqrt_approx( const v8float &a ) - { +inline v8float rsqrt_approx( const v8float& a ) +{ v8float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1598,10 +1661,10 @@ namespace v8 b.f[7] = ::sqrt( 1.0f / a.f[7] ); return b; - } +} - inline v8float rsqrt( const v8float &a ) - { +inline v8float rsqrt( const v8float& a ) +{ v8float b; b.f[0] = ::sqrt( 1.0f / a.f[0] ); @@ -1614,10 +1677,10 @@ namespace v8 b.f[7] = ::sqrt( 1.0f / a.f[7] ); return b; - } +} - inline v8float rcp_approx( const v8float &a ) - { +inline v8float rcp_approx( const v8float& a ) +{ v8float b; b.f[0] = 1.0f / a.f[0]; @@ -1630,10 +1693,10 @@ namespace v8 b.f[7] = 1.0f / a.f[7]; return b; - } +} - inline v8float rcp( const v8float &a ) - { +inline v8float rcp( const v8float& a ) +{ v8float b; b.f[0] = 1.0f / a.f[0]; @@ -1646,10 +1709,10 @@ namespace v8 b.f[7] = 1.0f / a.f[7]; return b; - } +} - inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fma( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = a.f[0] * b.f[0] + c.f[0]; @@ -1662,10 +1725,10 @@ namespace v8 d.f[7] = a.f[7] * b.f[7] + c.f[7]; return d; - } +} - inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = a.f[0] * b.f[0] - c.f[0]; @@ -1678,10 +1741,10 @@ namespace v8 d.f[7] = a.f[7] * b.f[7] 
- c.f[7]; return d; - } +} - inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fnms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; d.f[0] = c.f[0] - a.f[0] * b.f[0]; @@ -1694,10 +1757,10 @@ namespace v8 d.f[7] = c.f[7] - a.f[7] * b.f[7]; return d; - } +} - inline v8float clear_bits( const v8int &m, const v8float &a ) - { +inline v8float clear_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = ( ~m.i[0] ) & a.i[0]; @@ -1710,10 +1773,10 @@ namespace v8 b.i[7] = ( ~m.i[7] ) & a.i[7]; return b; - } +} - inline v8float set_bits( const v8int &m, const v8float &a ) - { +inline v8float set_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = m.i[0] | a.i[0]; @@ -1726,10 +1789,10 @@ namespace v8 b.i[7] = m.i[7] | a.i[7]; return b; - } +} - inline v8float toggle_bits( const v8int &m, const v8float &a ) - { +inline v8float toggle_bits( const v8int& m, const v8float& a ) +{ v8float b; b.i[0] = m.i[0] ^ a.i[0]; @@ -1742,10 +1805,10 @@ namespace v8 b.i[7] = m.i[7] ^ a.i[7]; return b; - } +} - inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void increment_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] += a.f[0]; p[1] += a.f[1]; p[2] += a.f[2]; @@ -1754,10 +1817,10 @@ namespace v8 p[5] += a.f[5]; p[6] += a.f[6]; p[7] += a.f[7]; - } +} - inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void decrement_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] -= a.f[0]; p[1] -= a.f[1]; p[2] -= a.f[2]; @@ -1766,10 +1829,10 @@ namespace v8 p[5] -= a.f[5]; p[6] -= a.f[6]; p[7] -= a.f[7]; - } +} - inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void scale_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ p[0] *= a.f[0]; p[1] *= a.f[1]; p[2] *= a.f[2]; @@ -1778,7 +1841,7 @@ namespace v8 p[5] *= a.f[5]; p[6] *= a.f[6]; p[7] *= a.f[7]; - } +} } // namespace v8 diff --git 
a/src/util/v8/v8_portable_v1.h b/src/util/v8/v8_portable_v1.h index 310c1390..5af53fa2 100644 --- a/src/util/v8/v8_portable_v1.h +++ b/src/util/v8/v8_portable_v1.h @@ -11,7 +11,7 @@ #define V8_PORTABLE_ACCELERATION #ifndef ALIGNED -#define ALIGNED(n) +#define ALIGNED( n ) #endif // This does not work with gcc 5.3.1 and the -fopenmp-simd @@ -22,206 +22,177 @@ // #define ALWAYS_VECTORIZE _Pragma( "simd" ) -#define ALWAYS_VECTORIZE \ - _Pragma( "simd" ) \ - _Pragma( "vector aligned" ) +#define ALWAYS_VECTORIZE _Pragma( "simd" ) _Pragma( "vector aligned" ) -#define ALWAYS_INLINE __attribute__((always_inline)) +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) namespace v8 { - class v8; - class v8int; - class v8float; +class v8; +class v8int; +class v8float; - //////////////// - // v8 base class +//////////////// +// v8 base class - class v8 - { +class v8 +{ friend class v8int; friend class v8float; // v8 miscellaneous friends - friend inline int any( const v8 &a ) ALWAYS_INLINE; - friend inline int all( const v8 &a ) ALWAYS_INLINE; + friend inline int any( const v8& a ) ALWAYS_INLINE; + friend inline int all( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 splat( const v8& a ) ALWAYS_INLINE; - template - friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + template + friend inline v8 shuffle( const v8& a ) ALWAYS_INLINE; - friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; - friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + friend inline void swap( v8& a, v8& b ) ALWAYS_INLINE; + friend inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, + v8& a5, v8& a6, v8& a7 ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 merge( const v8int &c, const v8 &a, 
const v8 &b ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& a, + const v8& b ) ALWAYS_INLINE; // v8 memory manipulation friends - friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; - friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + friend inline void load_8x1( const void* ALIGNED( 16 ) p, + v8& a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8& a, + void* ALIGNED( 16 ) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void* ALIGNED( 16 ) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void* ALIGNED( 16 ) dst, + const void* ALIGNED( 16 ) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void* ALIGNED( 16 ) a, + void* ALIGNED( 16 ) b ) ALWAYS_INLINE; // v8 transposed memory manipulation friends // Note: Half aligned values are permissible in the 8x2_tr variants. 
- friend inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) ALWAYS_INLINE; - - friend inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) ALWAYS_INLINE; - - friend inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; - - friend inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; - - friend inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; - - friend inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; - - friend inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, - void * ALIGNED(8) a1, - void * ALIGNED(8) a2, - void * ALIGNED(8) a3, - void * ALIGNED(8) a4, - void * ALIGNED(8) a5, - void * ALIGNED(8) a6, - void * ALIGNED(8) a7 ) ALWAYS_INLINE; - - friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 
&c, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x4_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; - - friend inline void store_8x8_tr( const v8 &a, const v8 &b, - const v8 &c, const v8 &d, - const v8 &e, const v8 &f, - const v8 &g, const v8 &h, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3, - void * ALIGNED(16) a4, - void * ALIGNED(16) a5, - void * ALIGNED(16) a6, - void * ALIGNED(16) a7 ) ALWAYS_INLINE; + friend inline void load_8x1_tr( const void* a0, const void* a1, + const void* a2, const void* a3, + const void* a4, const void* a5, + const void* a6, const void* a7, + v8& a ) ALWAYS_INLINE; + + friend inline void + load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, + v8& a, v8& b ) ALWAYS_INLINE; + + friend inline void + load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c ) ALWAYS_INLINE; + + friend inline void + load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + 
v8& a, v8& b, v8& c, v8& d ) ALWAYS_INLINE; + + friend inline void + load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, + v8& a, v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, + v8& h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, + void* a3, void* a4, void* a5, void* a6, + void* a7 ) ALWAYS_INLINE; + + friend inline void + store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) ALWAYS_INLINE; + + friend inline void + store_8x3_tr( const v8& a, const v8& b, const v8& c, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, + void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( + const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, + void* ALIGNED( 16 ) a3, void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( + const v8& a, const v8& b, const v8& c, const v8& d, const v8& e, + const v8& f, const v8& g, const v8& h, void* ALIGNED( 16 ) a0, + void* ALIGNED( 16 ) a1, void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, void* ALIGNED( 16 ) a6, + void* ALIGNED( 16 ) a7 ) ALWAYS_INLINE; protected: - - union - { - int i[8]; - float f[8]; + union { + int i[8]; + float f[8]; }; public: + v8() {} // Default constructor - v8() {} // Default constructor - - v8( const v8 &a ) // Copy constructor + v8( 
const v8& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; } - ~v8() {} // Default destructor - }; + ~v8() {} // Default destructor +}; - // v8 miscellaneous functions +// v8 miscellaneous functions - inline int any( const v8 &a ) - { - return a.i[0] || a.i[1] || a.i[2] || a.i[3] || - a.i[4] || a.i[5] || a.i[6] || a.i[7]; - } +inline int any( const v8& a ) +{ + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || a.i[4] || a.i[5] || a.i[6] || + a.i[7]; +} - inline int all( const v8 &a ) - { - return a.i[0] && a.i[1] && a.i[2] && a.i[3] && - a.i[4] && a.i[5] && a.i[6] && a.i[7]; - } +inline int all( const v8& a ) +{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && a.i[4] && a.i[5] && a.i[6] && + a.i[7]; +} - template - inline v8 splat( const v8 & a ) - { +template +inline v8 splat( const v8& a ) +{ v8 b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = a.i[n]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = a.i[n]; return b; - } +} - template - inline v8 shuffle( const v8 & a ) - { +template +inline v8 shuffle( const v8& a ) +{ v8 b; b.i[0] = a.i[i0]; @@ -234,1289 +205,1348 @@ namespace v8 b.i[7] = a.i[i7]; return b; - } +} -# define sw(x,y) x^=y, y^=x, x^=y +#define sw( x, y ) x ^= y, y ^= x, x ^= y - inline void swap( v8 &a, v8 &b ) - { +inline void swap( v8& a, v8& b ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - sw( a.i[j], b.i[j] ); - } - - inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, - v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) - { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); - sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); 
- sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); - sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); - sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); - sw( a6.i[7],a7.i[6] ); - } - -# undef sw - - // v8 memory manipulation functions - - inline void load_8x1( const void * ALIGNED(16) p, - v8 &a ) - { + for ( int j = 0; j < 8; j++ ) + sw( a.i[j], b.i[j] ); +} + +inline void transpose( v8& a0, v8& a1, v8& a2, v8& a3, v8& a4, v8& a5, v8& a6, + v8& a7 ) +{ + sw( a0.i[1], a1.i[0] ); + sw( a0.i[2], a2.i[0] ); + sw( a0.i[3], a3.i[0] ); + sw( a0.i[4], a4.i[0] ); + sw( a0.i[5], a5.i[0] ); + sw( a0.i[6], a6.i[0] ); + sw( a0.i[7], a7.i[0] ); + sw( a1.i[2], a2.i[1] ); + sw( a1.i[3], a3.i[1] ); + sw( a1.i[4], a4.i[1] ); + sw( a1.i[5], a5.i[1] ); + sw( a1.i[6], a6.i[1] ); + sw( a1.i[7], a7.i[1] ); + sw( a2.i[3], a3.i[2] ); + sw( a2.i[4], a4.i[2] ); + sw( a2.i[5], a5.i[2] ); + sw( a2.i[6], a6.i[2] ); + sw( a2.i[7], a7.i[2] ); + sw( a3.i[4], a4.i[3] ); + sw( a3.i[5], a5.i[3] ); + sw( a3.i[6], a6.i[3] ); + sw( a3.i[7], a7.i[3] ); + sw( a4.i[5], a5.i[4] ); + sw( a4.i[6], a6.i[4] ); + sw( a4.i[7], a7.i[4] ); + sw( a5.i[6], a6.i[5] ); + sw( a5.i[7], a7.i[5] ); + sw( a6.i[7], a7.i[6] ); +} + +#undef sw + +// v8 memory manipulation functions + +inline void load_8x1( const void* ALIGNED( 16 ) p, v8& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - a.i[j] = ((const int * ALIGNED(16))p)[j]; - } + for ( int j = 0; j < 8; j++ ) + a.i[j] = ( (const int* ALIGNED( 16 ))p )[j]; +} - inline void store_8x1( const v8 &a, - void * ALIGNED(16) p ) - { +inline void store_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; - } + for ( int j = 0; j < 8; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = a.i[j]; +} - inline void stream_8x1( const v8 &a, - void * ALIGNED(16) p ) - { +inline void stream_8x1( const v8& a, void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( 
int j = 0; j < 8; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; - } + for ( int j = 0; j < 8; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = a.i[j]; +} - inline void clear_8x1( void * ALIGNED(16) p ) - { +inline void clear_8x1( void* ALIGNED( 16 ) p ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - ((int * ALIGNED(16))p)[j] = 0; - } - - // FIXME: Ordering semantics - inline void copy_8x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) - { + for ( int j = 0; j < 8; j++ ) + ( (int* ALIGNED( 16 ))p )[j] = 0; +} + +// FIXME: Ordering semantics +inline void copy_8x1( void* ALIGNED( 16 ) dst, const void* ALIGNED( 16 ) src ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; - } + for ( int j = 0; j < 8; j++ ) + ( (int* ALIGNED( 16 ))dst )[j] = ( (const int* ALIGNED( 16 ))src )[j]; +} - inline void swap_8x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) - { +inline void swap_8x1( void* ALIGNED( 16 ) a, void* ALIGNED( 16 ) b ) +{ int t; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) + for ( int j = 0; j < 8; j++ ) { - t = ((int * ALIGNED(16))a)[j]; - ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - ((int * ALIGNED(16))b)[j] = t; + t = ( (int* ALIGNED( 16 ))a )[j]; + ( (int* ALIGNED( 16 ))a )[j] = ( (int* ALIGNED( 16 ))b )[j]; + ( (int* ALIGNED( 16 ))b )[j] = t; } - } - - // v8 transposed memory manipulation functions - - inline void load_8x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, - const void *a4, const void *a5, - const void *a6, const void *a7, - v8 &a ) - { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; - a.i[4] = ((const int *)a4)[0]; - a.i[5] = ((const int *)a5)[0]; - a.i[6] = ((const int *)a6)[0]; - a.i[7] = ((const int *)a7)[0]; - } - - inline void load_8x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * 
ALIGNED(8) a3, - const void * ALIGNED(8) a4, - const void * ALIGNED(8) a5, - const void * ALIGNED(8) a6, - const void * ALIGNED(8) a7, - v8 &a, v8 &b ) - { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; - - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; - - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; - - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; - - a.i[4] = ((const int * ALIGNED(8))a4)[0]; - b.i[4] = ((const int * ALIGNED(8))a4)[1]; - - a.i[5] = ((const int * ALIGNED(8))a5)[0]; - b.i[5] = ((const int * ALIGNED(8))a5)[1]; - - a.i[6] = ((const int * ALIGNED(8))a6)[0]; - b.i[6] = ((const int * ALIGNED(8))a6)[1]; - - a.i[7] = ((const int * ALIGNED(8))a7)[0]; - b.i[7] = ((const int * ALIGNED(8))a7)[1]; - } - - inline void load_8x3_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * 
ALIGNED(16))a5)[2]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - } - - inline void load_8x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - d.i[4] = ((const int * ALIGNED(16))a4)[3]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * 
ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - } - - inline void load_8x8_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - const void * ALIGNED(16) a4, - const void * ALIGNED(16) a5, - const void * ALIGNED(16) a6, - const void * ALIGNED(16) a7, - v8 &a, v8 &b, v8 &c, v8 &d, - v8 &e, v8 &f, v8 &g, v8 &h ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - e.i[0] = ((const int * ALIGNED(16))a0)[4]; - f.i[0] = ((const int * ALIGNED(16))a0)[5]; - g.i[0] = ((const int * ALIGNED(16))a0)[6]; - h.i[0] = ((const int * ALIGNED(16))a0)[7]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - e.i[1] = ((const int * ALIGNED(16))a1)[4]; - f.i[1] = ((const int * ALIGNED(16))a1)[5]; - g.i[1] = ((const int * ALIGNED(16))a1)[6]; - h.i[1] = ((const int * ALIGNED(16))a1)[7]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - e.i[2] = ((const int * ALIGNED(16))a2)[4]; - f.i[2] = ((const int * ALIGNED(16))a2)[5]; - g.i[2] = ((const int * ALIGNED(16))a2)[6]; - h.i[2] = ((const int * ALIGNED(16))a2)[7]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - e.i[3] = ((const int * ALIGNED(16))a3)[4]; - f.i[3] = ((const int * ALIGNED(16))a3)[5]; - g.i[3] = ((const int * ALIGNED(16))a3)[6]; - h.i[3] = ((const int * ALIGNED(16))a3)[7]; - - a.i[4] = ((const int * ALIGNED(16))a4)[0]; - b.i[4] = ((const int * ALIGNED(16))a4)[1]; - c.i[4] = ((const int * ALIGNED(16))a4)[2]; - 
d.i[4] = ((const int * ALIGNED(16))a4)[3]; - e.i[4] = ((const int * ALIGNED(16))a4)[4]; - f.i[4] = ((const int * ALIGNED(16))a4)[5]; - g.i[4] = ((const int * ALIGNED(16))a4)[6]; - h.i[4] = ((const int * ALIGNED(16))a4)[7]; - - a.i[5] = ((const int * ALIGNED(16))a5)[0]; - b.i[5] = ((const int * ALIGNED(16))a5)[1]; - c.i[5] = ((const int * ALIGNED(16))a5)[2]; - d.i[5] = ((const int * ALIGNED(16))a5)[3]; - e.i[5] = ((const int * ALIGNED(16))a5)[4]; - f.i[5] = ((const int * ALIGNED(16))a5)[5]; - g.i[5] = ((const int * ALIGNED(16))a5)[6]; - h.i[5] = ((const int * ALIGNED(16))a5)[7]; - - a.i[6] = ((const int * ALIGNED(16))a6)[0]; - b.i[6] = ((const int * ALIGNED(16))a6)[1]; - c.i[6] = ((const int * ALIGNED(16))a6)[2]; - d.i[6] = ((const int * ALIGNED(16))a6)[3]; - e.i[6] = ((const int * ALIGNED(16))a6)[4]; - f.i[6] = ((const int * ALIGNED(16))a6)[5]; - g.i[6] = ((const int * ALIGNED(16))a6)[6]; - h.i[6] = ((const int * ALIGNED(16))a6)[7]; - - a.i[7] = ((const int * ALIGNED(16))a7)[0]; - b.i[7] = ((const int * ALIGNED(16))a7)[1]; - c.i[7] = ((const int * ALIGNED(16))a7)[2]; - d.i[7] = ((const int * ALIGNED(16))a7)[3]; - e.i[7] = ((const int * ALIGNED(16))a7)[4]; - f.i[7] = ((const int * ALIGNED(16))a7)[5]; - g.i[7] = ((const int * ALIGNED(16))a7)[6]; - h.i[7] = ((const int * ALIGNED(16))a7)[7]; - } - - inline void store_8x1_tr( const v8 &a, - void *a0, void *a1, void *a2, void *a3, - void *a4, void *a5, void *a6, void *a7 ) - { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; - ((int *)a4)[0] = a.i[4]; - ((int *)a5)[0] = a.i[5]; - ((int *)a6)[0] = a.i[6]; - ((int *)a7)[0] = a.i[7]; - } - - inline void store_8x2_tr( const v8 &a, const v8 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3, - void * ALIGNED(8) a4, void * ALIGNED(8) a5, - void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) - { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; - - ((int * 
ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; - - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; - - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; - - ((int * ALIGNED(8))a4)[0] = a.i[4]; - ((int * ALIGNED(8))a4)[1] = b.i[4]; - - ((int * ALIGNED(8))a5)[0] = a.i[5]; - ((int * ALIGNED(8))a5)[1] = b.i[5]; - - ((int * ALIGNED(8))a6)[0] = a.i[6]; - ((int * ALIGNED(8))a6)[1] = b.i[6]; - - ((int * ALIGNED(8))a7)[0] = a.i[7]; - ((int * ALIGNED(8))a7)[1] = b.i[7]; - } - - inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - } - - inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * 
ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; - } - - inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, - const v8 &e, const v8 &f, const v8 &g, const v8 &h, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3, - void * ALIGNED(16) a4, void * ALIGNED(16) a5, - void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) - { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - ((int * ALIGNED(16))a0)[4] = e.i[0]; - ((int * ALIGNED(16))a0)[5] = f.i[0]; - ((int * ALIGNED(16))a0)[6] = g.i[0]; - ((int * ALIGNED(16))a0)[7] = h.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - 
((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - ((int * ALIGNED(16))a1)[4] = e.i[1]; - ((int * ALIGNED(16))a1)[5] = f.i[1]; - ((int * ALIGNED(16))a1)[6] = g.i[1]; - ((int * ALIGNED(16))a1)[7] = h.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - ((int * ALIGNED(16))a2)[4] = e.i[2]; - ((int * ALIGNED(16))a2)[5] = f.i[2]; - ((int * ALIGNED(16))a2)[6] = g.i[2]; - ((int * ALIGNED(16))a2)[7] = h.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; - ((int * ALIGNED(16))a3)[4] = e.i[3]; - ((int * ALIGNED(16))a3)[5] = f.i[3]; - ((int * ALIGNED(16))a3)[6] = g.i[3]; - ((int * ALIGNED(16))a3)[7] = h.i[3]; - - ((int * ALIGNED(16))a4)[0] = a.i[4]; - ((int * ALIGNED(16))a4)[1] = b.i[4]; - ((int * ALIGNED(16))a4)[2] = c.i[4]; - ((int * ALIGNED(16))a4)[3] = d.i[4]; - ((int * ALIGNED(16))a4)[4] = e.i[4]; - ((int * ALIGNED(16))a4)[5] = f.i[4]; - ((int * ALIGNED(16))a4)[6] = g.i[4]; - ((int * ALIGNED(16))a4)[7] = h.i[4]; - - ((int * ALIGNED(16))a5)[0] = a.i[5]; - ((int * ALIGNED(16))a5)[1] = b.i[5]; - ((int * ALIGNED(16))a5)[2] = c.i[5]; - ((int * ALIGNED(16))a5)[3] = d.i[5]; - ((int * ALIGNED(16))a5)[4] = e.i[5]; - ((int * ALIGNED(16))a5)[5] = f.i[5]; - ((int * ALIGNED(16))a5)[6] = g.i[5]; - ((int * ALIGNED(16))a5)[7] = h.i[5]; - - ((int * ALIGNED(16))a6)[0] = a.i[6]; - ((int * ALIGNED(16))a6)[1] = b.i[6]; - ((int * ALIGNED(16))a6)[2] = c.i[6]; - ((int * ALIGNED(16))a6)[3] = d.i[6]; - ((int * ALIGNED(16))a6)[4] = e.i[6]; - ((int * ALIGNED(16))a6)[5] = f.i[6]; - ((int * ALIGNED(16))a6)[6] = g.i[6]; - ((int * ALIGNED(16))a6)[7] = h.i[6]; - - ((int * ALIGNED(16))a7)[0] = a.i[7]; - ((int * ALIGNED(16))a7)[1] = b.i[7]; - ((int * ALIGNED(16))a7)[2] = c.i[7]; - ((int * ALIGNED(16))a7)[3] = d.i[7]; 
- ((int * ALIGNED(16))a7)[4] = e.i[7]; - ((int * ALIGNED(16))a7)[5] = f.i[7]; - ((int * ALIGNED(16))a7)[6] = g.i[7]; - ((int * ALIGNED(16))a7)[7] = h.i[7]; - } - - ////////////// - // v8int class - - class v8int : public v8 - { +} + +// v8 transposed memory manipulation functions + +inline void load_8x1_tr( const void* a0, const void* a1, const void* a2, + const void* a3, const void* a4, const void* a5, + const void* a6, const void* a7, v8& a ) +{ + a.i[0] = ( (const int*)a0 )[0]; + a.i[1] = ( (const int*)a1 )[0]; + a.i[2] = ( (const int*)a2 )[0]; + a.i[3] = ( (const int*)a3 )[0]; + a.i[4] = ( (const int*)a4 )[0]; + a.i[5] = ( (const int*)a5 )[0]; + a.i[6] = ( (const int*)a6 )[0]; + a.i[7] = ( (const int*)a7 )[0]; +} + +inline void +load_8x2_tr( const void* ALIGNED( 8 ) a0, const void* ALIGNED( 8 ) a1, + const void* ALIGNED( 8 ) a2, const void* ALIGNED( 8 ) a3, + const void* ALIGNED( 8 ) a4, const void* ALIGNED( 8 ) a5, + const void* ALIGNED( 8 ) a6, const void* ALIGNED( 8 ) a7, v8& a, + v8& b ) +{ + a.i[0] = ( (const int* ALIGNED( 8 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 8 ))a0 )[1]; + + a.i[1] = ( (const int* ALIGNED( 8 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 8 ))a1 )[1]; + + a.i[2] = ( (const int* ALIGNED( 8 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 8 ))a2 )[1]; + + a.i[3] = ( (const int* ALIGNED( 8 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 8 ))a3 )[1]; + + a.i[4] = ( (const int* ALIGNED( 8 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 8 ))a4 )[1]; + + a.i[5] = ( (const int* ALIGNED( 8 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 8 ))a5 )[1]; + + a.i[6] = ( (const int* ALIGNED( 8 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 8 ))a6 )[1]; + + a.i[7] = ( (const int* ALIGNED( 8 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 8 ))a7 )[1]; +} + +inline void +load_8x3_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + 
const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; +} + +inline void +load_8x4_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] = ( (const int* ALIGNED( 16 ))a1 )[3]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 
)[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; +} + +inline void +load_8x8_tr( const void* ALIGNED( 16 ) a0, const void* ALIGNED( 16 ) a1, + const void* ALIGNED( 16 ) a2, const void* ALIGNED( 16 ) a3, + const void* ALIGNED( 16 ) a4, const void* ALIGNED( 16 ) a5, + const void* ALIGNED( 16 ) a6, const void* ALIGNED( 16 ) a7, v8& a, + v8& b, v8& c, v8& d, v8& e, v8& f, v8& g, v8& h ) +{ + a.i[0] = ( (const int* ALIGNED( 16 ))a0 )[0]; + b.i[0] = ( (const int* ALIGNED( 16 ))a0 )[1]; + c.i[0] = ( (const int* ALIGNED( 16 ))a0 )[2]; + d.i[0] = ( (const int* ALIGNED( 16 ))a0 )[3]; + e.i[0] = ( (const int* ALIGNED( 16 ))a0 )[4]; + f.i[0] = ( (const int* ALIGNED( 16 ))a0 )[5]; + g.i[0] = ( (const int* ALIGNED( 16 ))a0 )[6]; + h.i[0] = ( (const int* ALIGNED( 16 ))a0 )[7]; + + a.i[1] = ( (const int* ALIGNED( 16 ))a1 )[0]; + b.i[1] = ( (const int* ALIGNED( 16 ))a1 )[1]; + c.i[1] = ( (const int* ALIGNED( 16 ))a1 )[2]; + d.i[1] 
= ( (const int* ALIGNED( 16 ))a1 )[3]; + e.i[1] = ( (const int* ALIGNED( 16 ))a1 )[4]; + f.i[1] = ( (const int* ALIGNED( 16 ))a1 )[5]; + g.i[1] = ( (const int* ALIGNED( 16 ))a1 )[6]; + h.i[1] = ( (const int* ALIGNED( 16 ))a1 )[7]; + + a.i[2] = ( (const int* ALIGNED( 16 ))a2 )[0]; + b.i[2] = ( (const int* ALIGNED( 16 ))a2 )[1]; + c.i[2] = ( (const int* ALIGNED( 16 ))a2 )[2]; + d.i[2] = ( (const int* ALIGNED( 16 ))a2 )[3]; + e.i[2] = ( (const int* ALIGNED( 16 ))a2 )[4]; + f.i[2] = ( (const int* ALIGNED( 16 ))a2 )[5]; + g.i[2] = ( (const int* ALIGNED( 16 ))a2 )[6]; + h.i[2] = ( (const int* ALIGNED( 16 ))a2 )[7]; + + a.i[3] = ( (const int* ALIGNED( 16 ))a3 )[0]; + b.i[3] = ( (const int* ALIGNED( 16 ))a3 )[1]; + c.i[3] = ( (const int* ALIGNED( 16 ))a3 )[2]; + d.i[3] = ( (const int* ALIGNED( 16 ))a3 )[3]; + e.i[3] = ( (const int* ALIGNED( 16 ))a3 )[4]; + f.i[3] = ( (const int* ALIGNED( 16 ))a3 )[5]; + g.i[3] = ( (const int* ALIGNED( 16 ))a3 )[6]; + h.i[3] = ( (const int* ALIGNED( 16 ))a3 )[7]; + + a.i[4] = ( (const int* ALIGNED( 16 ))a4 )[0]; + b.i[4] = ( (const int* ALIGNED( 16 ))a4 )[1]; + c.i[4] = ( (const int* ALIGNED( 16 ))a4 )[2]; + d.i[4] = ( (const int* ALIGNED( 16 ))a4 )[3]; + e.i[4] = ( (const int* ALIGNED( 16 ))a4 )[4]; + f.i[4] = ( (const int* ALIGNED( 16 ))a4 )[5]; + g.i[4] = ( (const int* ALIGNED( 16 ))a4 )[6]; + h.i[4] = ( (const int* ALIGNED( 16 ))a4 )[7]; + + a.i[5] = ( (const int* ALIGNED( 16 ))a5 )[0]; + b.i[5] = ( (const int* ALIGNED( 16 ))a5 )[1]; + c.i[5] = ( (const int* ALIGNED( 16 ))a5 )[2]; + d.i[5] = ( (const int* ALIGNED( 16 ))a5 )[3]; + e.i[5] = ( (const int* ALIGNED( 16 ))a5 )[4]; + f.i[5] = ( (const int* ALIGNED( 16 ))a5 )[5]; + g.i[5] = ( (const int* ALIGNED( 16 ))a5 )[6]; + h.i[5] = ( (const int* ALIGNED( 16 ))a5 )[7]; + + a.i[6] = ( (const int* ALIGNED( 16 ))a6 )[0]; + b.i[6] = ( (const int* ALIGNED( 16 ))a6 )[1]; + c.i[6] = ( (const int* ALIGNED( 16 ))a6 )[2]; + d.i[6] = ( (const int* ALIGNED( 16 ))a6 )[3]; + e.i[6] = ( (const int* 
ALIGNED( 16 ))a6 )[4]; + f.i[6] = ( (const int* ALIGNED( 16 ))a6 )[5]; + g.i[6] = ( (const int* ALIGNED( 16 ))a6 )[6]; + h.i[6] = ( (const int* ALIGNED( 16 ))a6 )[7]; + + a.i[7] = ( (const int* ALIGNED( 16 ))a7 )[0]; + b.i[7] = ( (const int* ALIGNED( 16 ))a7 )[1]; + c.i[7] = ( (const int* ALIGNED( 16 ))a7 )[2]; + d.i[7] = ( (const int* ALIGNED( 16 ))a7 )[3]; + e.i[7] = ( (const int* ALIGNED( 16 ))a7 )[4]; + f.i[7] = ( (const int* ALIGNED( 16 ))a7 )[5]; + g.i[7] = ( (const int* ALIGNED( 16 ))a7 )[6]; + h.i[7] = ( (const int* ALIGNED( 16 ))a7 )[7]; +} + +inline void store_8x1_tr( const v8& a, void* a0, void* a1, void* a2, void* a3, + void* a4, void* a5, void* a6, void* a7 ) +{ + ( (int*)a0 )[0] = a.i[0]; + ( (int*)a1 )[0] = a.i[1]; + ( (int*)a2 )[0] = a.i[2]; + ( (int*)a3 )[0] = a.i[3]; + ( (int*)a4 )[0] = a.i[4]; + ( (int*)a5 )[0] = a.i[5]; + ( (int*)a6 )[0] = a.i[6]; + ( (int*)a7 )[0] = a.i[7]; +} + +inline void store_8x2_tr( const v8& a, const v8& b, void* ALIGNED( 8 ) a0, + void* ALIGNED( 8 ) a1, void* ALIGNED( 8 ) a2, + void* ALIGNED( 8 ) a3, void* ALIGNED( 8 ) a4, + void* ALIGNED( 8 ) a5, void* ALIGNED( 8 ) a6, + void* ALIGNED( 8 ) a7 ) +{ + ( (int* ALIGNED( 8 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 8 ))a0 )[1] = b.i[0]; + + ( (int* ALIGNED( 8 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 8 ))a1 )[1] = b.i[1]; + + ( (int* ALIGNED( 8 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 8 ))a2 )[1] = b.i[2]; + + ( (int* ALIGNED( 8 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 8 ))a3 )[1] = b.i[3]; + + ( (int* ALIGNED( 8 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 8 ))a4 )[1] = b.i[4]; + + ( (int* ALIGNED( 8 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 8 ))a5 )[1] = b.i[5]; + + ( (int* ALIGNED( 8 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 8 ))a6 )[1] = b.i[6]; + + ( (int* ALIGNED( 8 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 8 ))a7 )[1] = b.i[7]; +} + +inline void store_8x3_tr( const v8& a, const v8& b, const v8& c, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) 
a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; +} + +inline void store_8x4_tr( const v8& a, const v8& b, const v8& c, const v8& d, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 
)[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; +} + +inline void store_8x8_tr( const v8& a, const v8& b, const v8& c, const v8& d, + const v8& e, const v8& f, const v8& g, const v8& h, + void* ALIGNED( 16 ) a0, void* ALIGNED( 16 ) a1, + void* ALIGNED( 16 ) a2, void* ALIGNED( 16 ) a3, + void* ALIGNED( 16 ) a4, void* ALIGNED( 16 ) a5, + void* ALIGNED( 16 ) a6, void* ALIGNED( 16 ) a7 ) +{ + ( (int* ALIGNED( 16 ))a0 )[0] = a.i[0]; + ( (int* ALIGNED( 16 ))a0 )[1] = b.i[0]; + ( (int* ALIGNED( 16 ))a0 )[2] = c.i[0]; + ( (int* ALIGNED( 16 ))a0 )[3] = d.i[0]; + ( (int* ALIGNED( 16 ))a0 )[4] = e.i[0]; + ( (int* ALIGNED( 16 ))a0 )[5] = f.i[0]; + ( (int* ALIGNED( 16 ))a0 )[6] = g.i[0]; + ( (int* ALIGNED( 16 ))a0 )[7] = h.i[0]; + + ( (int* ALIGNED( 16 ))a1 )[0] = a.i[1]; + ( (int* ALIGNED( 16 ))a1 )[1] = b.i[1]; + ( (int* ALIGNED( 16 ))a1 )[2] = c.i[1]; + ( (int* ALIGNED( 16 ))a1 )[3] = d.i[1]; + ( (int* ALIGNED( 16 ))a1 )[4] = e.i[1]; + ( (int* ALIGNED( 16 ))a1 )[5] = f.i[1]; + ( (int* ALIGNED( 16 ))a1 )[6] = g.i[1]; + ( (int* ALIGNED( 16 ))a1 )[7] = h.i[1]; + + ( (int* ALIGNED( 16 ))a2 )[0] = a.i[2]; + ( (int* ALIGNED( 16 ))a2 )[1] = b.i[2]; + ( (int* ALIGNED( 16 ))a2 )[2] = c.i[2]; + ( (int* ALIGNED( 16 ))a2 )[3] = d.i[2]; + ( (int* ALIGNED( 16 ))a2 )[4] = 
e.i[2]; + ( (int* ALIGNED( 16 ))a2 )[5] = f.i[2]; + ( (int* ALIGNED( 16 ))a2 )[6] = g.i[2]; + ( (int* ALIGNED( 16 ))a2 )[7] = h.i[2]; + + ( (int* ALIGNED( 16 ))a3 )[0] = a.i[3]; + ( (int* ALIGNED( 16 ))a3 )[1] = b.i[3]; + ( (int* ALIGNED( 16 ))a3 )[2] = c.i[3]; + ( (int* ALIGNED( 16 ))a3 )[3] = d.i[3]; + ( (int* ALIGNED( 16 ))a3 )[4] = e.i[3]; + ( (int* ALIGNED( 16 ))a3 )[5] = f.i[3]; + ( (int* ALIGNED( 16 ))a3 )[6] = g.i[3]; + ( (int* ALIGNED( 16 ))a3 )[7] = h.i[3]; + + ( (int* ALIGNED( 16 ))a4 )[0] = a.i[4]; + ( (int* ALIGNED( 16 ))a4 )[1] = b.i[4]; + ( (int* ALIGNED( 16 ))a4 )[2] = c.i[4]; + ( (int* ALIGNED( 16 ))a4 )[3] = d.i[4]; + ( (int* ALIGNED( 16 ))a4 )[4] = e.i[4]; + ( (int* ALIGNED( 16 ))a4 )[5] = f.i[4]; + ( (int* ALIGNED( 16 ))a4 )[6] = g.i[4]; + ( (int* ALIGNED( 16 ))a4 )[7] = h.i[4]; + + ( (int* ALIGNED( 16 ))a5 )[0] = a.i[5]; + ( (int* ALIGNED( 16 ))a5 )[1] = b.i[5]; + ( (int* ALIGNED( 16 ))a5 )[2] = c.i[5]; + ( (int* ALIGNED( 16 ))a5 )[3] = d.i[5]; + ( (int* ALIGNED( 16 ))a5 )[4] = e.i[5]; + ( (int* ALIGNED( 16 ))a5 )[5] = f.i[5]; + ( (int* ALIGNED( 16 ))a5 )[6] = g.i[5]; + ( (int* ALIGNED( 16 ))a5 )[7] = h.i[5]; + + ( (int* ALIGNED( 16 ))a6 )[0] = a.i[6]; + ( (int* ALIGNED( 16 ))a6 )[1] = b.i[6]; + ( (int* ALIGNED( 16 ))a6 )[2] = c.i[6]; + ( (int* ALIGNED( 16 ))a6 )[3] = d.i[6]; + ( (int* ALIGNED( 16 ))a6 )[4] = e.i[6]; + ( (int* ALIGNED( 16 ))a6 )[5] = f.i[6]; + ( (int* ALIGNED( 16 ))a6 )[6] = g.i[6]; + ( (int* ALIGNED( 16 ))a6 )[7] = h.i[6]; + + ( (int* ALIGNED( 16 ))a7 )[0] = a.i[7]; + ( (int* ALIGNED( 16 ))a7 )[1] = b.i[7]; + ( (int* ALIGNED( 16 ))a7 )[2] = c.i[7]; + ( (int* ALIGNED( 16 ))a7 )[3] = d.i[7]; + ( (int* ALIGNED( 16 ))a7 )[4] = e.i[7]; + ( (int* ALIGNED( 16 ))a7 )[5] = f.i[7]; + ( (int* ALIGNED( 16 ))a7 )[6] = g.i[7]; + ( (int* ALIGNED( 16 ))a7 )[7] = h.i[7]; +} + +////////////// +// v8int class + +class v8int : public v8 +{ // v8int prefix unary operator friends - friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; - 
friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator~( const v8int& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8int& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8int prefix increment / decrement operator friends - friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a ) ALWAYS_INLINE; // v8int postfix increment / decrement operator friends - friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; - friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator++( v8int& a, int ) ALWAYS_INLINE; + friend inline v8int operator--( v8int& a, int ) ALWAYS_INLINE; // v8int binary operator friends - friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >>( const v8int &a, const v8int &b ) 
ALWAYS_INLINE; + friend inline v8int operator+( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator-( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator*(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator/( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator%( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator^( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&(const v8int& a, const v8int& b)ALWAYS_INLINE; + friend inline v8int operator|( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int logical operator friends - friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8int& 
a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8int& a, + const v8int& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8int& a, + const v8int& b ) ALWAYS_INLINE; // v8int miscellaneous friends - friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; - friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; - friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8int abs( const v8int& a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int& c, const v8& a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int& c, const v8& a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + friend inline v8 merge( const v8int& c, const v8& t, + const v8& f ) ALWAYS_INLINE; // v8float unary operator friends - friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) 
ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float miscellaneous friends - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; public: - // v8int constructors / destructors - v8int() {} // Default constructor + v8int() {} // Default constructor - v8int( const v8int &a ) // Copy constructor + v8int( const v8int& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; } - v8int( const v8 &a ) // Init from mixed + v8int( const v8& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - i[j] = a.i[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; } - v8int( int a ) // Init from scalar + v8int( int a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - i[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + i[j] = a; } - v8int( int i0, int i1, int i2, int i3, // Init from scalars - int i4, int i5, int i6, int i7 ) + v8int( int i0, int i1, int i2, int i3, // Init from scalars + int i4, int i5, int i6, int i7 ) { - i[0] = i0; - i[1] 
= i1; - i[2] = i2; - i[3] = i3; - i[4] = i4; - i[5] = i5; - i[6] = i6; - i[7] = i7; + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + i[4] = i4; + i[5] = i5; + i[6] = i6; + i[7] = i7; } - ~v8int() {} // Destructor + ~v8int() {} // Destructor // v8int assignment operators -# define ASSIGN(op) \ - inline v8int &operator op( const v8int &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - i[j] op b.i[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8int& operator op( const v8int& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ } - ASSIGN( =) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) + ASSIGN( %= ) + ASSIGN( ^= ) + ASSIGN( &= ) + ASSIGN( |= ) + ASSIGN( <<= ) + ASSIGN( >>= ) -# undef ASSIGN +#undef ASSIGN // v8int member access operator - inline int &operator []( int n ) - { - return i[n]; - } + inline int& operator[]( int n ) { return i[n]; } - inline int operator ()( int n ) - { - return i[n]; - } - }; + inline int operator()( int n ) { return i[n]; } +}; - // v8int prefix unary operators +// v8int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v8int operator op( const v8int & a ) \ - { \ - v8int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } +#define PREFIX_UNARY( op ) \ + inline v8int operator op( const v8int& a ) \ + { \ + v8int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } - PREFIX_UNARY(+) - PREFIX_UNARY(-) +PREFIX_UNARY( +) +PREFIX_UNARY( -) - inline v8int operator !( const v8int & a ) - { +inline v8int operator!( const v8int& a ) +{ v8int b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = - ( !a.i[j] ); + for ( int j = 0; j < 8; j++ ) + b.i[j] = -( !a.i[j] ); return b; - } - 
- PREFIX_UNARY(~) - -# undef PREFIX_UNARY - - // v8int prefix increment / decrement - -# define PREFIX_INCDEC(op) \ - inline v8int operator op( v8int & a ) \ - { \ - v8int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - b.i[j] = ( op a.i[j] ); \ - return b; \ - } - - PREFIX_INCDEC(++) - PREFIX_INCDEC(--) - -# undef PREFIX_INCDEC - - // v8int postfix increment / decrement - -# define POSTFIX_INCDEC(op) \ - inline v8int operator op( v8int & a, int ) \ - { \ - v8int b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - b.i[j] = ( a.i[j] op ); \ - return b; \ - } - - POSTFIX_INCDEC(++) - POSTFIX_INCDEC(--) - -# undef POSTFIX_INCDEC - - // v8int binary operators - -# define BINARY(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - c.i[j] = a.i[j] op b.i[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) - BINARY(<<) - BINARY(>>) - -# undef BINARY - - // v8int logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8int &a, const v8int &b ) \ - { \ - v8int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - c.i[j] = - ( a.i[j] op b.i[j] ); \ - return c; \ - } - - LOGICAL(<) - LOGICAL(>) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8int miscellaneous functions - - inline v8int abs( const v8int &a ) - { +} + +PREFIX_UNARY( ~) + +#undef PREFIX_UNARY + +// v8int prefix increment / decrement + +#define PREFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a ) \ + { \ + v8int b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + +PREFIX_INCDEC( ++) +PREFIX_INCDEC( --) + +#undef PREFIX_INCDEC + +// v8int postfix increment / decrement + +#define POSTFIX_INCDEC( op ) \ + inline v8int operator op( v8int& a, int ) \ + { \ + v8int b; \ + 
ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + +POSTFIX_INCDEC( ++) +POSTFIX_INCDEC( --) + +#undef POSTFIX_INCDEC + +// v8int binary operators + +#define BINARY( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) +BINARY( % ) +BINARY( ^) +BINARY( & ) +BINARY( | ) +BINARY( << ) +BINARY( >> ) + +#undef BINARY + +// v8int logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8int& a, const v8int& b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + c.i[j] = -( a.i[j] op b.i[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8int miscellaneous functions + +inline v8int abs( const v8int& a ) +{ v8int b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; return b; - } +} - inline v8 czero( const v8int &c, const v8 &a ) - { +inline v8 czero( const v8int& c, const v8& a ) +{ v8 b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; - } +} - inline v8 notczero( const v8int &c, const v8 &a ) - { +inline v8 notczero( const v8int& c, const v8& a ) +{ v8 b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = a.i[j] & c.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] & c.i[j]; return b; - } +} - inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) - { +inline v8 merge( const v8int& c, const v8& t, const v8& f ) +{ v8 m; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + for ( int j = 0; j < 8; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return m; - } +} - //////////////// - // v8float class +//////////////// +// v8float class - class v8float : public v8 - { +class v8float : public v8 +{ // v8float prefix unary operator friends - friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; - friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; - friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a ) ALWAYS_INLINE; + friend inline v8float operator~( const v8float& a ) ALWAYS_INLINE; + friend inline v8int operator!( const v8float& a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v8float prefix increment / decrement operator friends - friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a ) ALWAYS_INLINE; + friend inline v8float 
operator--( v8float& a ) ALWAYS_INLINE; // v8float postfix increment / decrement operator friends - friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; - friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator++( v8float& a, int ) ALWAYS_INLINE; + friend inline v8float operator--( v8float& a, int ) ALWAYS_INLINE; // v8float binary operator friends - friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator+( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator-( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8float operator*(const v8float& a, + const v8float& b)ALWAYS_INLINE; + friend inline v8float operator/( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float logical operator friends - friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; - friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator<( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>( const v8float& a, + const 
v8float& b ) ALWAYS_INLINE; + friend inline v8int operator==( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator!=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator<=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator>=( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator&&( const v8float& a, + const v8float& b ) ALWAYS_INLINE; + friend inline v8int operator||( const v8float& a, + const v8float& b ) ALWAYS_INLINE; // v8float math library friends -# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ - const v8float &b ) ALWAYS_INLINE - - CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); - CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); - CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); - CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); - CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); - - CMATH_FR2(copysign); - -# undef CMATH_FR1 -# undef CMATH_FR2 +#define CMATH_FR1( fn ) \ + friend inline v8float fn( const v8float& a ) ALWAYS_INLINE +#define CMATH_FR2( fn ) \ + friend inline v8float fn( const v8float& a, const v8float& b ) ALWAYS_INLINE + + CMATH_FR1( acos ); + CMATH_FR1( asin ); + CMATH_FR1( atan ); + CMATH_FR2( atan2 ); + CMATH_FR1( ceil ); + CMATH_FR1( cos ); + CMATH_FR1( cosh ); + CMATH_FR1( exp ); + CMATH_FR1( fabs ); + CMATH_FR1( floor ); + CMATH_FR2( fmod ); + CMATH_FR1( log ); + CMATH_FR1( log10 ); + CMATH_FR2( pow ); + CMATH_FR1( sin ); + CMATH_FR1( sinh ); + CMATH_FR1( sqrt ); + CMATH_FR1( tan ); + CMATH_FR1( tanh ); + + CMATH_FR2( copysign ); + +#undef CMATH_FR1 +#undef CMATH_FR2 // v8float miscellaneous friends - friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rsqrt ( const v8float &a ) 
ALWAYS_INLINE; - friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; - friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; - friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; - friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; - friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; - friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rsqrt( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float& a ) ALWAYS_INLINE; + friend inline v8float rcp( const v8float& a ) ALWAYS_INLINE; + friend inline v8float fma( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float& a, const v8float& b, + const v8float& c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int& m, + const v8float& a ) ALWAYS_INLINE; + friend inline void increment_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) 
ALWAYS_INLINE; + friend inline void scale_8x1( float* ALIGNED( 16 ) p, + const v8float& a ) ALWAYS_INLINE; public: - // v8float constructors / destructors - v8float() {} // Default constructor + v8float() {} // Default constructor - v8float( const v8float &a ) // Copy constructor + v8float( const v8float& a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + f[j] = a.f[j]; } - v8float( const v8 &a ) // Init from mixed + v8float( const v8& a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - f[j] = a.f[j]; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + f[j] = a.f[j]; } - v8float( float a ) // Init from scalar + v8float( float a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - f[j] = a; + ALWAYS_VECTORIZE + for ( int j = 0; j < 8; j++ ) + f[j] = a; } - v8float( float f0, float f1, float f2, float f3, - float f4, float f5, float f6, float f7 ) // Init from scalars + v8float( float f0, float f1, float f2, float f3, float f4, float f5, + float f6, float f7 ) // Init from scalars { - f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; - f[4] = f4; f[5] = f5; f[6] = f6; f[7] = f7; + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + f[4] = f4; + f[5] = f5; + f[6] = f6; + f[7] = f7; } - ~v8float() {} // Destructor + ~v8float() {} // Destructor // v8float assignment operators -# define ASSIGN(op) \ - inline v8float &operator op( const v8float &b ) \ - { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - f[j] op b.f[j]; \ - return *this; \ +#define ASSIGN( op ) \ + inline v8float& operator op( const v8float& b ) \ + { \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + ASSIGN( = ) + ASSIGN( += ) + ASSIGN( -= ) + ASSIGN( *= ) + ASSIGN( /= ) -# undef ASSIGN +#undef ASSIGN // v8float member access operator - inline float 
&operator []( int n ) - { - return f[n]; - } + inline float& operator[]( int n ) { return f[n]; } - inline float operator ()( int n ) - { - return f[n]; - } - }; + inline float operator()( int n ) { return f[n]; } +}; - // v8float prefix unary operators +// v8float prefix unary operators - inline v8float operator +( const v8float &a ) - { +inline v8float operator+( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = +a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = +a.f[j]; return b; - } +} - inline v8float operator -( const v8float &a ) - { +inline v8float operator-( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = -a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = -a.f[j]; return b; - } +} - inline v8int operator !( const v8float &a ) - { +inline v8int operator!( const v8float& a ) +{ v8int b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = a.i[j] ? 0 : -1; + for ( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; return b; - } +} - // v8float prefix increment / decrement operators +// v8float prefix increment / decrement operators - inline v8float operator ++( v8float &a ) - { +inline v8float operator++( v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = ++a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = ++a.f[j]; return b; - } +} - inline v8float operator --( v8float &a ) - { +inline v8float operator--( v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = --a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = --a.f[j]; return b; - } +} - // v8float postfix increment / decrement operators +// v8float postfix increment / decrement operators - inline v8float operator ++( v8float &a, int ) - { +inline v8float operator++( v8float& a, int ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = a.f[j]++; + for ( int j = 0; j < 8; j++ ) + b.f[j] = a.f[j]++; return b; - } +} - inline v8float operator --( v8float &a, int ) - { +inline v8float operator--( v8float& a, int ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = a.f[j]--; + for ( int j = 0; j < 8; j++ ) + b.f[j] = a.f[j]--; return b; - } - - // v8float binary operators - -# define BINARY(op) \ - inline v8float operator op( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - c.f[j] = a.f[j] op b.f[j]; \ - return c; \ - } - - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) - -# undef BINARY - - // v8float logical operators - -# define LOGICAL(op) \ - inline v8int operator op( const v8float &a, const v8float &b ) \ - { \ - v8int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - c.i[j] = - ( a.f[j] op b.f[j] ); \ - return c; \ - } - - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) - -# undef LOGICAL - - // v8float math library functions - -# define CMATH_FR1(fn) \ - inline 
v8float fn( const v8float &a ) \ - { \ - v8float b; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - b.f[j] = ::fn( a.f[j] ); \ - return b; \ - } - -# define CMATH_FR2(fn) \ - inline v8float fn( const v8float &a, const v8float &b ) \ - { \ - v8float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 8; j++ ) \ - c.f[j] = ::fn( a.f[j], b.f[j] ); \ - return c; \ - } - - CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) - CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) - CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) - CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) - CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - - inline v8float copysign( const v8float &a, const v8float &b ) - { +} + +// v8float binary operators + +#define BINARY( op ) \ + inline v8float operator op( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + +BINARY( +) +BINARY( -) +BINARY( * ) +BINARY( / ) + +#undef BINARY + +// v8float logical operators + +#define LOGICAL( op ) \ + inline v8int operator op( const v8float& a, const v8float& b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } + +LOGICAL( < ) +LOGICAL( > ) +LOGICAL( == ) +LOGICAL( != ) +LOGICAL( <= ) +LOGICAL( >= ) +LOGICAL( &&) +LOGICAL( || ) + +#undef LOGICAL + +// v8float math library functions + +#define CMATH_FR1( fn ) \ + inline v8float fn( const v8float& a ) \ + { \ + v8float b; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +#define CMATH_FR2( fn ) \ + inline v8float fn( const v8float& a, const v8float& b ) \ + { \ + v8float c; \ + ALWAYS_VECTORIZE \ + for ( int j = 0; j < 8; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + +CMATH_FR1( acos ) +CMATH_FR1( asin ) CMATH_FR1( atan ) CMATH_FR2( atan2 ) 
CMATH_FR1( ceil ) + CMATH_FR1( cos ) CMATH_FR1( cosh ) CMATH_FR1( exp ) CMATH_FR1( fabs ) + CMATH_FR1( floor ) CMATH_FR2( fmod ) CMATH_FR1( log ) CMATH_FR1( log10 ) + CMATH_FR2( pow ) CMATH_FR1( sin ) CMATH_FR1( sinh ) + CMATH_FR1( sqrt ) CMATH_FR1( tan ) CMATH_FR1( tanh ) + + inline v8float + copysign( const v8float& a, const v8float& b ) +{ v8float c; float t; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) + for ( int j = 0; j < 8; j++ ) { - t = ::fabs( a.f[j] ); - if( b.f[j] < 0 ) t = -t; - c.f[j] = t; + t = ::fabs( a.f[j] ); + if ( b.f[j] < 0 ) + t = -t; + c.f[j] = t; } return c; - } +} -# undef CMATH_FR1 -# undef CMATH_FR2 +#undef CMATH_FR1 +#undef CMATH_FR2 - // v8float miscellaneous functions +// v8float miscellaneous functions - inline v8float rsqrt_approx( const v8float &a ) - { +inline v8float rsqrt_approx( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + for ( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; - } +} - inline v8float rsqrt( const v8float &a ) - { +inline v8float rsqrt( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + for ( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; - } +} - inline v8float rcp_approx( const v8float &a ) - { +inline v8float rcp_approx( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = 1.0f / a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v8float rcp( const v8float &a ) - { +inline v8float rcp( const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.f[j] = 1.0f / a.f[j]; + for ( int j = 0; j < 8; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; - } +} - inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fma( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; 
ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - d.f[j] = a.f[j] * b.f[j] + c.f[j]; + for ( int j = 0; j < 8; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; return d; - } +} - inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - d.f[j] = a.f[j] * b.f[j] - c.f[j]; + for ( int j = 0; j < 8; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; return d; - } +} - inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) - { +inline v8float fnms( const v8float& a, const v8float& b, const v8float& c ) +{ v8float d; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - d.f[j] = c.f[j] - a.f[j] * b.f[j]; + for ( int j = 0; j < 8; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; return d; - } +} - inline v8float clear_bits( const v8int &m, const v8float &a ) - { +inline v8float clear_bits( const v8int& m, const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = ( ~m.i[j] ) & a.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; - } +} - inline v8float set_bits( const v8int &m, const v8float &a ) - { +inline v8float set_bits( const v8int& m, const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = m.i[j] | a.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = m.i[j] | a.i[j]; return b; - } +} - inline v8float toggle_bits( const v8int &m, const v8float &a ) - { +inline v8float toggle_bits( const v8int& m, const v8float& a ) +{ v8float b; ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - b.i[j] = m.i[j] ^ a.i[j]; + for ( int j = 0; j < 8; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; return b; - } +} - inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void increment_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - p[j] += a.f[j]; - } + for ( int j = 0; j 
< 8; j++ ) + p[j] += a.f[j]; +} - inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void decrement_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - p[j] -= a.f[j]; - } + for ( int j = 0; j < 8; j++ ) + p[j] -= a.f[j]; +} - inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) - { +inline void scale_8x1( float* ALIGNED( 16 ) p, const v8float& a ) +{ ALWAYS_VECTORIZE - for( int j = 0; j < 8; j++ ) - p[j] *= a.f[j]; - } + for ( int j = 0; j < 8; j++ ) + p[j] *= a.f[j]; +} } // namespace v8 diff --git a/src/vpic/dumpmacros.h b/src/vpic/dumpmacros.h index 9e46bf6b..264c12a5 100644 --- a/src/vpic/dumpmacros.h +++ b/src/vpic/dumpmacros.h @@ -4,50 +4,54 @@ /* FIXME: WHEN THESE MACROS WERE HOISTED AND VARIOUS HACKS DONE TO THEM THEY BECAME _VERY_ _DANGEROUS. */ -#define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - WRITE( char, CHAR_BIT, fileIO ); \ - WRITE( char, sizeof(short int), fileIO ); \ - WRITE( char, sizeof(int), fileIO ); \ - WRITE( char, sizeof(float), fileIO ); \ - WRITE( char, sizeof(double), fileIO ); \ - WRITE( short int, 0xcafe, fileIO ); \ - WRITE( int, 0xdeadbeef, fileIO ); \ - WRITE( float, 1.0, fileIO ); \ - WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - WRITE( int, 0 /* Version */, fileIO ); \ - WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - WRITE( int, step(), fileIO ); \ - WRITE( int, nxout, fileIO ); \ - WRITE( int, nyout, fileIO ); \ - WRITE( int, nzout, fileIO ); \ - WRITE( float, grid->dt, fileIO ); \ - WRITE( float, dxout, fileIO ); \ - WRITE( float, dyout, fileIO ); \ - WRITE( float, dzout, fileIO ); \ - WRITE( float, grid->x0, fileIO ); \ - WRITE( float, grid->y0, fileIO ); \ - WRITE( float, grid->z0, fileIO ); \ - WRITE( float, grid->cvac, fileIO ); \ - WRITE( float, grid->eps0, fileIO ); \ - WRITE( float, 0 /* damp */, fileIO ); \ - WRITE( int, 
rank(), fileIO ); \ - WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - WRITE( int, sp_id, fileIO ); \ - WRITE( float, q_m, fileIO ); \ - } while(0) - +#define WRITE_HEADER_V0( dump_type, sp_id, q_m, fileIO ) \ + do \ + { \ + /* Binary compatibility information */ \ + WRITE( char, CHAR_BIT, fileIO ); \ + WRITE( char, sizeof( short int ), fileIO ); \ + WRITE( char, sizeof( int ), fileIO ); \ + WRITE( char, sizeof( float ), fileIO ); \ + WRITE( char, sizeof( double ), fileIO ); \ + WRITE( short int, 0xcafe, fileIO ); \ + WRITE( int, 0xdeadbeef, fileIO ); \ + WRITE( float, 1.0, fileIO ); \ + WRITE( double, 1.0, fileIO ); \ + /* Dump type and header format version */ \ + WRITE( int, 0 /* Version */, fileIO ); \ + WRITE( int, dump_type, fileIO ); \ + /* High level information */ \ + WRITE( int, step(), fileIO ); \ + WRITE( int, nxout, fileIO ); \ + WRITE( int, nyout, fileIO ); \ + WRITE( int, nzout, fileIO ); \ + WRITE( float, grid->dt, fileIO ); \ + WRITE( float, dxout, fileIO ); \ + WRITE( float, dyout, fileIO ); \ + WRITE( float, dzout, fileIO ); \ + WRITE( float, grid->x0, fileIO ); \ + WRITE( float, grid->y0, fileIO ); \ + WRITE( float, grid->z0, fileIO ); \ + WRITE( float, grid->cvac, fileIO ); \ + WRITE( float, grid->eps0, fileIO ); \ + WRITE( float, 0 /* damp */, fileIO ); \ + WRITE( int, rank(), fileIO ); \ + WRITE( int, nproc(), fileIO ); \ + /* Species parameters */ \ + WRITE( int, sp_id, fileIO ); \ + WRITE( float, q_m, fileIO ); \ + } while ( 0 ) + // Note dim _MUST_ be a pointer to an int - -#define WRITE_ARRAY_HEADER(p,ndim,dim,fileIO) do { \ - WRITE( int, sizeof(p[0]), fileIO ); \ - WRITE( int, ndim, fileIO ); \ - fileIO.write( dim, ndim ); \ - } while(0) - + +#define WRITE_ARRAY_HEADER( p, ndim, dim, fileIO ) \ + do \ + { \ + WRITE( int, sizeof( p[0] ), fileIO ); \ + WRITE( int, ndim, fileIO ); \ + fileIO.write( dim, ndim ); \ + } while ( 0 ) + // The WRITE macro copies the output "value" into a temporary variable // of the requested 
output "type" so that the write to the "file" // occurs from a known binary data type. For example, if grid.dx were @@ -60,118 +64,137 @@ // single precision write copies. However, specialty types could be // created so that the type cast __WRITE_tmp = (type)(value) // automatically does the underlying conversion in C++ - -#define WRITE(type,value,fileIO) do { \ - type __WRITE_tmp = (type)(value); \ - fileIO.write( &__WRITE_tmp, 1 ); \ - } while(0) - + +#define WRITE( type, value, fileIO ) \ + do \ + { \ + type __WRITE_tmp = ( type )( value ); \ + fileIO.write( &__WRITE_tmp, 1 ); \ + } while ( 0 ) + // Note: strlen does not include the terminating \0 -#define WRITE_STRING(string,fileIO) do { \ - int __WRITE_STRING_len = 0; \ - if( string ) __WRITE_STRING_len = strlen(string); \ - fileIO.write( &__WRITE_STRING_len, 1 ); \ - if( __WRITE_STRING_len>0 ) \ - fileIO.write( string, __WRITE_STRING_len ); \ - } while(0) - -#define READ(type,value,fileIO) do { \ - type __READ_tmp; \ - fileIO.read(&__READ_tmp, 1 ); \ - (value) = __READ_tmp; \ - } while(0) - -#define F_WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, imxstr-2, fileIO ); \ - F_WRITE( int, jmxstr-2, fileIO ); \ - F_WRITE( int, kmxstr-2, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, dxstr, fileIO ); \ - F_WRITE( float, dystr, fileIO ); \ - F_WRITE( float, dzstr, fileIO 
); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - -#define F_WRITE_HEADER_PAR(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, grid->nx, fileIO ); \ - F_WRITE( int, grid->ny, fileIO ); \ - F_WRITE( int, grid->nz, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, grid->dx, fileIO ); \ - F_WRITE( float, grid->dy, fileIO ); \ - F_WRITE( float, grid->dz, fileIO ); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - +#define WRITE_STRING( string, fileIO ) \ + do \ + { \ + int __WRITE_STRING_len = 0; \ + if ( string ) \ + __WRITE_STRING_len = strlen( string ); \ + fileIO.write( 
&__WRITE_STRING_len, 1 ); \ + if ( __WRITE_STRING_len > 0 ) \ + fileIO.write( string, __WRITE_STRING_len ); \ + } while ( 0 ) + +#define READ( type, value, fileIO ) \ + do \ + { \ + type __READ_tmp; \ + fileIO.read( &__READ_tmp, 1 ); \ + ( value ) = __READ_tmp; \ + } while ( 0 ) + +#define F_WRITE_HEADER_V0( dump_type, sp_id, q_m, fileIO ) \ + do \ + { \ + /* Binary compatibility information */ \ + F_WRITE( char, CHAR_BIT, fileIO ); \ + F_WRITE( char, sizeof( short int ), fileIO ); \ + F_WRITE( char, sizeof( int ), fileIO ); \ + F_WRITE( char, sizeof( float ), fileIO ); \ + F_WRITE( char, sizeof( double ), fileIO ); \ + F_WRITE( short int, 0xcafe, fileIO ); \ + F_WRITE( int, 0xdeadbeef, fileIO ); \ + F_WRITE( float, 1.0, fileIO ); \ + F_WRITE( double, 1.0, fileIO ); \ + /* Dump type and header format version */ \ + F_WRITE( int, 0 /* Version */, fileIO ); \ + F_WRITE( int, dump_type, fileIO ); \ + /* High level information */ \ + F_WRITE( int, step(), fileIO ); \ + F_WRITE( int, imxstr - 2, fileIO ); \ + F_WRITE( int, jmxstr - 2, fileIO ); \ + F_WRITE( int, kmxstr - 2, fileIO ); \ + F_WRITE( float, grid->dt, fileIO ); \ + F_WRITE( float, dxstr, fileIO ); \ + F_WRITE( float, dystr, fileIO ); \ + F_WRITE( float, dzstr, fileIO ); \ + F_WRITE( float, grid->x0, fileIO ); \ + F_WRITE( float, grid->y0, fileIO ); \ + F_WRITE( float, grid->z0, fileIO ); \ + F_WRITE( float, grid->cvac, fileIO ); \ + F_WRITE( float, grid->eps0, fileIO ); \ + F_WRITE( float, 0 /*damp*/, fileIO ); \ + F_WRITE( int, rank(), fileIO ); \ + F_WRITE( int, nproc(), fileIO ); \ + /* Species parameters */ \ + F_WRITE( int, sp_id, fileIO ); \ + F_WRITE( float, q_m, fileIO ); \ + } while ( 0 ) + +#define F_WRITE_HEADER_PAR( dump_type, sp_id, q_m, fileIO ) \ + do \ + { \ + /* Binary compatibility information */ \ + F_WRITE( char, CHAR_BIT, fileIO ); \ + F_WRITE( char, sizeof( short int ), fileIO ); \ + F_WRITE( char, sizeof( int ), fileIO ); \ + F_WRITE( char, sizeof( float ), fileIO ); \ + F_WRITE( char, 
sizeof( double ), fileIO ); \ + F_WRITE( short int, 0xcafe, fileIO ); \ + F_WRITE( int, 0xdeadbeef, fileIO ); \ + F_WRITE( float, 1.0, fileIO ); \ + F_WRITE( double, 1.0, fileIO ); \ + /* Dump type and header format version */ \ + F_WRITE( int, 0 /* Version */, fileIO ); \ + F_WRITE( int, dump_type, fileIO ); \ + /* High level information */ \ + F_WRITE( int, step(), fileIO ); \ + F_WRITE( int, grid->nx, fileIO ); \ + F_WRITE( int, grid->ny, fileIO ); \ + F_WRITE( int, grid->nz, fileIO ); \ + F_WRITE( float, grid->dt, fileIO ); \ + F_WRITE( float, grid->dx, fileIO ); \ + F_WRITE( float, grid->dy, fileIO ); \ + F_WRITE( float, grid->dz, fileIO ); \ + F_WRITE( float, grid->x0, fileIO ); \ + F_WRITE( float, grid->y0, fileIO ); \ + F_WRITE( float, grid->z0, fileIO ); \ + F_WRITE( float, grid->cvac, fileIO ); \ + F_WRITE( float, grid->eps0, fileIO ); \ + F_WRITE( float, 0 /*damp*/, fileIO ); \ + F_WRITE( int, rank(), fileIO ); \ + F_WRITE( int, nproc(), fileIO ); \ + /* Species parameters */ \ + F_WRITE( int, sp_id, fileIO ); \ + F_WRITE( float, q_m, fileIO ); \ + } while ( 0 ) + // Note dim _MUST_ be a pointer to an int - -#define F_WRITE_ARRAY_HEADER(psiz,ndim,dim,fileIO) do { \ - F_WRITE( int, psiz, fileIO ); \ - F_WRITE( int, ndim, fileIO ); \ - fileIO.write( dim, ndim ); \ - } while(0) - -#define F_WRITE(type,value,fileIO) do { \ - type __F_WRITE_tmp = (type)(value); \ - fileIO.write( &__F_WRITE_tmp, 1 ); \ - } while(0) - -#define F_READ(type,value,fileIO) do { \ - type __F_READ_tmp; \ - fileIO.read( &__F_READ_tmp, 1 ); \ - (value) = __F_READ_tmp; \ - } while(0) - -#define ABORT(cond) if( cond ) ERROR(( #cond )) + +#define F_WRITE_ARRAY_HEADER( psiz, ndim, dim, fileIO ) \ + do \ + { \ + F_WRITE( int, psiz, fileIO ); \ + F_WRITE( int, ndim, fileIO ); \ + fileIO.write( dim, ndim ); \ + } while ( 0 ) + +#define F_WRITE( type, value, fileIO ) \ + do \ + { \ + type __F_WRITE_tmp = ( type )( value ); \ + fileIO.write( &__F_WRITE_tmp, 1 ); \ + } while ( 0 ) + +#define 
F_READ( type, value, fileIO ) \ + do \ + { \ + type __F_READ_tmp; \ + fileIO.read( &__F_READ_tmp, 1 ); \ + ( value ) = __F_READ_tmp; \ + } while ( 0 ) + +#define ABORT( cond ) \ + if ( cond ) \ + ERROR( ( #cond ) ) #endif // dumpmacros_h diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 9c22048e..da2e425d 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -13,16 +13,16 @@ #ifndef vpic_h #define vpic_h -#include #include +#include #include "../boundary/boundary.h" #include "../collision/collision.h" #include "../emitter/emitter.h" // FIXME: INCLUDES ONCE ALL IS CLEANED UP -#include "../util/io/FileIO.h" #include "../util/bitfield.h" #include "../util/checksum.h" +#include "../util/io/FileIO.h" #include "../util/system.h" #ifndef USER_GLOBAL_SIZE @@ -36,640 +36,672 @@ typedef FileIO FILETYPE; -const uint32_t all (0xffffffff); -const uint32_t electric (1<<0 | 1<<1 | 1<<2); -const uint32_t div_e_err (1<<3); -const uint32_t magnetic (1<<4 | 1<<5 | 1<<6); -const uint32_t div_b_err (1<<7); -const uint32_t tca (1<<8 | 1<<9 | 1<<10); -const uint32_t rhob (1<<11); -const uint32_t current (1<<12 | 1<<13 | 1<<14); -const uint32_t rhof (1<<15); -const uint32_t emat (1<<16 | 1<<17 | 1<<18); -const uint32_t nmat (1<<19); -const uint32_t fmat (1<<20 | 1<<21 | 1<<22); -const uint32_t cmat (1<<23); - -const size_t total_field_variables(24); -const size_t total_field_groups(12); // this counts vectors, tensors etc... 
+const uint32_t all( 0xffffffff ); +const uint32_t electric( 1 << 0 | 1 << 1 | 1 << 2 ); +const uint32_t div_e_err( 1 << 3 ); +const uint32_t magnetic( 1 << 4 | 1 << 5 | 1 << 6 ); +const uint32_t div_b_err( 1 << 7 ); +const uint32_t tca( 1 << 8 | 1 << 9 | 1 << 10 ); +const uint32_t rhob( 1 << 11 ); +const uint32_t current( 1 << 12 | 1 << 13 | 1 << 14 ); +const uint32_t rhof( 1 << 15 ); +const uint32_t emat( 1 << 16 | 1 << 17 | 1 << 18 ); +const uint32_t nmat( 1 << 19 ); +const uint32_t fmat( 1 << 20 | 1 << 21 | 1 << 22 ); +const uint32_t cmat( 1 << 23 ); + +const size_t total_field_variables( 24 ); +const size_t total_field_groups( 12 ); // this counts vectors, tensors etc... // These bits will be tested to determine which variables to output -const size_t field_indeces[12] = { 0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23 }; - -struct FieldInfo { - char name[128]; - char degree[128]; - char elements[128]; - char type[128]; - size_t size; +const size_t field_indeces[12] = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23}; + +struct FieldInfo +{ + char name[128]; + char degree[128]; + char elements[128]; + char type[128]; + size_t size; }; // struct FieldInfo -const uint32_t current_density (1<<0 | 1<<1 | 1<<2); -const uint32_t charge_density (1<<3); -const uint32_t momentum_density (1<<4 | 1<<5 | 1<<6); -const uint32_t ke_density (1<<7); -const uint32_t stress_tensor (1<<8 | 1<<9 | 1<<10 | 1<<11 | 1<<12 | 1<<13); +const uint32_t current_density( 1 << 0 | 1 << 1 | 1 << 2 ); +const uint32_t charge_density( 1 << 3 ); +const uint32_t momentum_density( 1 << 4 | 1 << 5 | 1 << 6 ); +const uint32_t ke_density( 1 << 7 ); +const uint32_t stress_tensor( 1 << 8 | 1 << 9 | 1 << 10 | 1 << 11 | 1 << 12 | + 1 << 13 ); /* May want to use these instead const uint32_t stress_diagonal (1<<8 | 1<<9 | 1<<10); const uint32_t stress_offdiagonal (1<<11 | 1<<12 | 1<<13); */ -const size_t total_hydro_variables(14); -const size_t total_hydro_groups(5); // this counts vectors, tensors etc... 
+const size_t total_hydro_variables( 14 ); +const size_t total_hydro_groups( 5 ); // this counts vectors, tensors etc... // These bits will be tested to determine which variables to output -const size_t hydro_indeces[5] = { 0, 3, 4, 7, 8 }; - -struct HydroInfo { - char name[128]; - char degree[128]; - char elements[128]; - char type[128]; - size_t size; +const size_t hydro_indeces[5] = {0, 3, 4, 7, 8}; + +struct HydroInfo +{ + char name[128]; + char degree[128]; + char elements[128]; + char type[128]; + size_t size; }; // struct FieldInfo /*---------------------------------------------------------------------------- * DumpFormat Enumeration ----------------------------------------------------------------------------*/ -enum DumpFormat { - band = 0, - band_interleave = 1 +enum DumpFormat +{ + band = 0, + band_interleave = 1 }; // enum DumpFormat /*---------------------------------------------------------------------------- * DumpParameters Struct ----------------------------------------------------------------------------*/ -struct DumpParameters { +struct DumpParameters +{ - void output_variables(uint32_t mask) { - output_vars.set(mask); - } // output_variables + void output_variables( uint32_t mask ) + { + output_vars.set( mask ); + } // output_variables - BitField output_vars; + BitField output_vars; - size_t stride_x; - size_t stride_y; - size_t stride_z; + size_t stride_x; + size_t stride_y; + size_t stride_z; - DumpFormat format; + DumpFormat format; - char name[128]; - char baseDir[128]; - char baseFileName[128]; + char name[128]; + char baseDir[128]; + char baseFileName[128]; }; // struct DumpParameters -class vpic_simulation { -public: - vpic_simulation(); - ~vpic_simulation(); - void initialize( int argc, char **argv ); - void modify( const char *fname ); - int advance( void ); - void finalize( void ); - -protected: - - // Directly initialized by user - - int verbose; // Should system be verbose - int num_step; // Number of steps to take - int 
num_comm_round; // Num comm round - int status_interval; // How often to print status messages - int clean_div_e_interval; // How often to clean div e - int num_div_e_round; // How many clean div e rounds per div e interval - int clean_div_b_interval; // How often to clean div b - int num_div_b_round; // How many clean div b rounds per div b interval - int sync_shared_interval; // How often to synchronize shared faces - - // FIXME: THESE INTERVALS SHOULDN'T BE PART OF vpic_simulation - // THE BIG LIST FOLLOWING IT SHOULD BE CLEANED UP TOO - - double quota; - int checkpt_interval; - int hydro_interval; - int field_interval; - int particle_interval; - - size_t nxout, nyout, nzout; - size_t px, py, pz; - float dxout, dyout, dzout; - - int ndfld; - int ndhyd; - int ndpar; - int ndhis; - int ndgrd; - int head_option; - int istride; - int jstride; - int kstride; - int stride_option; - int pstride; - int nprobe; - int ijkprobe[NVARHISMX][4]; - float xyzprobe[NVARHISMX][3]; - int block_dump; - int stepdigit; - int rankdigit; - int ifenergies; - - // Helper initialized by user - - /* There are enough synchronous and local random number generators - to permit the host thread plus all the pipeline threads for one - dispatcher to simultaneously produce both synchronous and local - random numbers. Keeping the synchronous generators in sync is - the generator users responsibility. 
*/ - - rng_pool_t * entropy; // Local entropy pool - rng_pool_t * sync_entropy; // Synchronous entropy pool - grid_t * grid; // define_*_grid et al - material_t * material_list; // define_material - field_array_t * field_array; // define_field_array - interpolator_array_t * interpolator_array; // define_interpolator_array - accumulator_array_t * accumulator_array; // define_accumulator_array - hydro_array_t * hydro_array; // define_hydro_array - species_t * species_list; // define_species / - // species helpers - particle_bc_t * particle_bc_list; // define_particle_bc / - // boundary helpers - emitter_t * emitter_list; // define_emitter / - // emitter helpers - collision_op_t * collision_op_list; // collision helpers - - // User defined checkpt preserved variables - // Note: user_global is aliased with user_global_t (see deck_wrapper.cxx) - - char user_global[USER_GLOBAL_SIZE]; - - /*---------------------------------------------------------------------------- - * Diagnostics - ---------------------------------------------------------------------------*/ - double poynting_flux(double e0); - - /*---------------------------------------------------------------------------- - * Check Sums - ---------------------------------------------------------------------------*/ -#if defined(ENABLE_OPENSSL) - void output_checksum_fields(); - void checksum_fields(CheckSum & cs); - void output_checksum_species(const char * species); - void checksum_species(const char * species, CheckSum & cs); +class vpic_simulation +{ + public: + vpic_simulation(); + ~vpic_simulation(); + void initialize( int argc, char** argv ); + void modify( const char* fname ); + int advance( void ); + void finalize( void ); + + protected: + // Directly initialized by user + + int verbose; // Should system be verbose + int num_step; // Number of steps to take + int num_comm_round; // Num comm round + int status_interval; // How often to print status messages + int clean_div_e_interval; // How often to clean div 
e + int num_div_e_round; // How many clean div e rounds per div e interval + int clean_div_b_interval; // How often to clean div b + int num_div_b_round; // How many clean div b rounds per div b interval + int sync_shared_interval; // How often to synchronize shared faces + + // FIXME: THESE INTERVALS SHOULDN'T BE PART OF vpic_simulation + // THE BIG LIST FOLLOWING IT SHOULD BE CLEANED UP TOO + + double quota; + int checkpt_interval; + int hydro_interval; + int field_interval; + int particle_interval; + + size_t nxout, nyout, nzout; + size_t px, py, pz; + float dxout, dyout, dzout; + + int ndfld; + int ndhyd; + int ndpar; + int ndhis; + int ndgrd; + int head_option; + int istride; + int jstride; + int kstride; + int stride_option; + int pstride; + int nprobe; + int ijkprobe[NVARHISMX][4]; + float xyzprobe[NVARHISMX][3]; + int block_dump; + int stepdigit; + int rankdigit; + int ifenergies; + + // Helper initialized by user + + /* There are enough synchronous and local random number generators + to permit the host thread plus all the pipeline threads for one + dispatcher to simultaneously produce both synchronous and local + random numbers. Keeping the synchronous generators in sync is + the generator users responsibility. 
*/ + + rng_pool_t* entropy; // Local entropy pool + rng_pool_t* sync_entropy; // Synchronous entropy pool + grid_t* grid; // define_*_grid et al + material_t* material_list; // define_material + field_array_t* field_array; // define_field_array + interpolator_array_t* interpolator_array; // define_interpolator_array + accumulator_array_t* accumulator_array; // define_accumulator_array + hydro_array_t* hydro_array; // define_hydro_array + species_t* species_list; // define_species / + // species helpers + particle_bc_t* particle_bc_list; // define_particle_bc / + // boundary helpers + emitter_t* emitter_list; // define_emitter / + // emitter helpers + collision_op_t* collision_op_list; // collision helpers + + // User defined checkpt preserved variables + // Note: user_global is aliased with user_global_t (see deck_wrapper.cxx) + + char user_global[USER_GLOBAL_SIZE]; + + /*---------------------------------------------------------------------------- + * Diagnostics + ---------------------------------------------------------------------------*/ + double poynting_flux( double e0 ); + + /*---------------------------------------------------------------------------- + * Check Sums + ---------------------------------------------------------------------------*/ +#if defined( ENABLE_OPENSSL ) + void output_checksum_fields(); + void checksum_fields( CheckSum& cs ); + void output_checksum_species( const char* species ); + void checksum_species( const char* species, CheckSum& cs ); #endif // ENABLE_OPENSSL - void print_available_ram() { - SystemRAM::print_available(); - } // print_available_ram - - /////////////// - // Dump helpers - - int dump_mkdir(const char * dname); - int dump_cwd(char * dname, size_t size); - - // Text dumps - void dump_energies( const char *fname, int append = 1 ); - void dump_materials( const char *fname ); - void dump_species( const char *fname ); - - // Binary dumps - void dump_grid( const char *fbase ); - void dump_fields( const char *fbase, int 
fname_tag = 1 ); - void dump_hydro( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - void dump_particles( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - - // convenience functions for simlog output - void create_field_list(char * strlist, DumpParameters & dumpParams); - void create_hydro_list(char * strlist, DumpParameters & dumpParams); - - void print_hashed_comment(FileIO & fileIO, const char * comment); - void global_header(const char * base, - std::vector dumpParams); - - void field_header(const char * fbase, DumpParameters & dumpParams); - void hydro_header(const char * speciesname, const char * hbase, - DumpParameters & dumpParams); - - void field_dump(DumpParameters & dumpParams); - void hydro_dump(const char * speciesname, DumpParameters & dumpParams); - - /////////////////// - // Useful accessors - - inline int - rank() { return world_rank; } - - inline int - nproc() { return world_size; } - - inline void - barrier() { mp_barrier(); } - - inline double - time() { - return grid->t0 + (double)grid->dt*(double)grid->step; - } - - inline int64_t & - step() { - return grid->step; - } - - inline field_t & - field( const int v ) { - return field_array->f[ v ]; - } - - inline int - voxel( const int ix, const int iy, const int iz ) { - return ix + grid->sy*iy + grid->sz*iz; - } - - inline field_t & - field( const int ix, const int iy, const int iz ) { - return field_array->f[ voxel(ix,iy,iz) ]; - } - - inline interpolator_t & - interpolator( const int v ) { - return interpolator_array->i[ v ]; - } - - inline interpolator_t & - interpolator( const int ix, const int iy, const int iz ) { - return interpolator_array->i[ voxel(ix,iy,iz) ]; - } - - inline hydro_t & - hydro( const int v ) { - return hydro_array->h[ v ]; - } - - inline hydro_t & - hydro( const int ix, const int iy, const int iz ) { - return hydro_array->h[ voxel(ix,iy,iz) ]; - } - - inline rng_t * - rng( const int n ) { - return entropy->rng[n]; - } - - inline rng_t * - 
sync_rng( const int n ) { - return sync_entropy->rng[n]; - } - - /////////////// - // Grid helpers - - inline void - define_units( float cvac, - float eps0 ) { - grid->cvac = cvac; - grid->eps0 = eps0; - } - - inline void - define_timestep( float dt, double t0 = 0, int64_t step = 0 ) { - grid->t0 = t0; - grid->dt = (float)dt; - grid->step = step; - } - - // The below functions automatically create partition simple grids with - // simple boundary conditions on the edges. - - inline void - define_periodic_grid( double xl, double yl, double zl, - double xh, double yh, double zh, - double gnx, double gny, double gnz, - double gpx, double gpy, double gpz ) { - px = size_t(gpx); py = size_t(gpy); pz = size_t(gpz); - partition_periodic_box( grid, xl, yl, zl, xh, yh, zh, - (int)gnx, (int)gny, (int)gnz, - (int)gpx, (int)gpy, (int)gpz ); - } - - inline void - define_absorbing_grid( double xl, double yl, double zl, - double xh, double yh, double zh, - double gnx, double gny, double gnz, - double gpx, double gpy, double gpz, int pbc ) { - px = size_t(gpx); py = size_t(gpy); pz = size_t(gpz); - partition_absorbing_box( grid, xl, yl, zl, xh, yh, zh, - (int)gnx, (int)gny, (int)gnz, - (int)gpx, (int)gpy, (int)gpz, - pbc ); - } - - inline void - define_reflecting_grid( double xl, double yl, double zl, - double xh, double yh, double zh, - double gnx, double gny, double gnz, - double gpx, double gpy, double gpz ) { - px = size_t(gpx); py = size_t(gpy); pz = size_t(gpz); - partition_metal_box( grid, xl, yl, zl, xh, yh, zh, - (int)gnx, (int)gny, (int)gnz, - (int)gpx, (int)gpy, (int)gpz ); - } - - // The below macros allow custom domains to be created - - // Creates a particle reflecting metal box in the local domain - inline void - size_domain( double lnx, double lny, double lnz ) { - size_grid(grid,(int)lnx,(int)lny,(int)lnz); - } - - // Attaches a local domain boundary to another domain - inline void join_domain( int boundary, double rank ) { - join_grid( grid, boundary, (int)rank ); 
- } - - // Sets the field boundary condition of a local domain boundary - inline void set_domain_field_bc( int boundary, int fbc ) { - set_fbc( grid, boundary, fbc ); - } - - // Sets the particle boundary condition of a local domain boundary - inline void set_domain_particle_bc( int boundary, int pbc ) { - set_pbc( grid, boundary, pbc ); - } - - /////////////////// - // Material helpers - - inline material_t * - define_material( const char * name, - double eps, - double mu = 1, - double sigma = 0, - double zeta = 0 ) { - return append_material( material( name, - eps, eps, eps, - mu, mu, mu, - sigma, sigma, sigma, - zeta, zeta, zeta ), &material_list ); - } - - inline material_t * - define_material( const char * name, - double epsx, double epsy, double epsz, - double mux, double muy, double muz, - double sigmax, double sigmay, double sigmaz, - double zetax = 0 , double zetay = 0, double zetaz = 0 ) { - return append_material( material( name, - epsx, epsy, epsz, - mux, muy, muz, - sigmax, sigmay, sigmaz, - zetax, zetay, zetaz ), &material_list ); - } - - inline material_t * - lookup_material( const char * name ) { - return find_material_name( name, material_list ); - } - - inline material_t * - lookup_material( material_id id ) { - return find_material_id( id, material_list ); - } - - ////////////////////// - // Field array helpers - - // If fa is provided, define_field_advance will use it (and take ownership - // of the it). Otherwise the standard field array will be used with the - // optionally provided radition damping parameter. - - inline void - define_field_array( field_array_t * fa = NULL, double damp = 0 ) { - int nx1 = grid->nx + 1, ny1 = grid->ny+1, nz1 = grid->nz+1; - - if( grid->nx<1 || grid->ny<1 || grid->nz<1 ) - ERROR(( "Define your grid before defining the field array" )); - if( !material_list ) - ERROR(( "Define your materials before defining the field array" )); - - field_array = fa ? 
fa : - new_standard_field_array( grid, material_list, damp ); - interpolator_array = new_interpolator_array( grid ); - accumulator_array = new_accumulator_array( grid ); - hydro_array = new_hydro_array( grid ); - - // Pre-size communications buffers. This is done to get most memory - // allocation over with before the simulation starts running - - mp_size_recv_buffer(grid->mp,BOUNDARY(-1, 0, 0),ny1*nz1*sizeof(hydro_t)); - mp_size_recv_buffer(grid->mp,BOUNDARY( 1, 0, 0),ny1*nz1*sizeof(hydro_t)); - mp_size_recv_buffer(grid->mp,BOUNDARY( 0,-1, 0),nz1*nx1*sizeof(hydro_t)); - mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 1, 0),nz1*nx1*sizeof(hydro_t)); - mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 0,-1),nx1*ny1*sizeof(hydro_t)); - mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 0, 1),nx1*ny1*sizeof(hydro_t)); - - mp_size_send_buffer(grid->mp,BOUNDARY(-1, 0, 0),ny1*nz1*sizeof(hydro_t)); - mp_size_send_buffer(grid->mp,BOUNDARY( 1, 0, 0),ny1*nz1*sizeof(hydro_t)); - mp_size_send_buffer(grid->mp,BOUNDARY( 0,-1, 0),nz1*nx1*sizeof(hydro_t)); - mp_size_send_buffer(grid->mp,BOUNDARY( 0, 1, 0),nz1*nx1*sizeof(hydro_t)); - mp_size_send_buffer(grid->mp,BOUNDARY( 0, 0,-1),nx1*ny1*sizeof(hydro_t)); - mp_size_send_buffer(grid->mp,BOUNDARY( 0, 0, 1),nx1*ny1*sizeof(hydro_t)); - } - - // Other field helpers are provided by macros in deck_wrapper.cxx - - ////////////////// - // Species helpers - - // FIXME: SILLY PROMOTIONS - inline species_t * - define_species( const char *name, - double q, - double m, - double max_local_np, - double max_local_nm, - double sort_interval, - double sort_out_of_place ) { - // Compute a reasonble number of movers if user did not specify - // Based on the twice the number of particles expected to hit the boundary - // of a wpdt=0.2 / dx=lambda species in a 3x3x3 domain - if( max_local_nm<0 ) { - max_local_nm = 2*max_local_np/25; - if( max_local_nm<16*(MAX_PIPELINE+1) ) - max_local_nm = 16*(MAX_PIPELINE+1); - } - return append_species( species( name, (float)q, (float)m, - 
(size_t)max_local_np, (size_t)max_local_nm, - (int)sort_interval, (int)sort_out_of_place, - grid ), &species_list ); - } - - inline species_t * - find_species( const char *name ) { - return find_species_name( name, species_list ); - } - - inline species_t * - find_species( int32_t id ) { - return find_species_id( id, species_list ); - } - - /////////////////// - // Particle helpers - - // Note: Don't use injection with aging during initialization - - // Defaults in the declaration below enable backwards compatibility. - - void - inject_particle( species_t * sp, - double x, double y, double z, - double ux, double uy, double uz, - double w, double age = 0, int update_rhob = 1 ); - - // Inject particle raw is for power users! - // No nannyism _at_ _all_: - // - Availability of free stoarge is _not_ checked. - // - Particle displacements and voxel index are _not_ for validity. - // - The rhob field is _not_ updated. - // - Injection with displacment may use up movers (i.e. don't use - // injection with displacement during initialization). - // This injection is _ultra_ _fast_. 
- - inline void - inject_particle_raw( species_t * RESTRICT sp, - float dx, float dy, float dz, int32_t i, - float ux, float uy, float uz, float w ) - { - particle_t * RESTRICT p = sp->p + (sp->np++); - p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; - p->ux = ux; p->uy = uy; p->uz = uz; p->w = w; - } - - // This variant does a raw inject and moves the particles - - inline void - inject_particle_raw( species_t * RESTRICT sp, - float dx, float dy, float dz, int32_t i, - float ux, float uy, float uz, float w, - float dispx, float dispy, float dispz, - int update_rhob ) - { - particle_t * RESTRICT p = sp->p + (sp->np++); - particle_mover_t * RESTRICT pm = sp->pm + sp->nm; - p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; - p->ux = ux; p->uy = uy; p->uz = uz; p->w = w; - pm->dispx = dispx; pm->dispy = dispy; pm->dispz = dispz; pm->i = sp->np-1; - if( update_rhob ) accumulate_rhob( field_array->f, p, grid, -sp->q ); - sp->nm += move_p( sp->p, pm, accumulator_array->a, grid, sp->q ); - } - - ////////////////////////////////// - // Random number generator helpers - - // seed_rand seed the all the random number generators. The seed - // used for the individual generators is based off the user provided - // seed such each local generator in each process (rng[0:r-1]) gets - // a unique seed. Each synchronous generator (sync_rng[0:r-1]) gets a - // unique seed that does not overlap with the local generators - // (common across each process). Lastly, all these seeds are such - // that, no individual generator seeds are reused across different - // user seeds. - // FIXME: MTRAND DESPERATELY NEEDS A LARGER SEED SPACE! - - inline void seed_entropy( int base ) { - seed_rng_pool( entropy, base, 0 ); - seed_rng_pool( sync_entropy, base, 1 ); - } - - // Uniform random number on (low,high) (open interval) - // FIXME: IS THE INTERVAL STILL OPEN IN FINITE PRECISION - // AND IS THE OPEN INTERVAL REALLY WHAT USERS WANT?? 
- inline double uniform( rng_t * rng, double low, double high ) { - double dx = drand( rng ); - return low*(1-dx) + high*dx; - } - - // Normal random number with mean mu and standard deviation sigma - inline double normal( rng_t * rng, double mu, double sigma ) { - return mu + sigma*drandn( rng ); - } - - ///////////////////////////////// - // Emitter and particle bc helpers - - // Note that append_emitter is hacked to silently returne if the - // emitter is already in the list. This allows things like: - // - // define_surface_emitter( my_emitter( ... ), rgn ) - // ... or ... - // my_emit_t * e = my_emit( ... ) - // define_surface_emitter( e, rgn ) - // ... or ... - // my_emit_t * e = define_emitter( my_emit( ... ) ) - // define_surface_emitter( e, rng ) - // ... - // All to work. (Nominally, would like define_surface_emitter - // to evaluate to the value of e. But, alas, the way - // define_surface_emitter works and language limitations of - // strict C++ prevent this.) - - inline emitter_t * - define_emitter( emitter_t * e ) { - return append_emitter( e, &emitter_list ); - } - - inline particle_bc_t * - define_particle_bc( particle_bc_t * pbc ) { - return append_particle_bc( pbc, &particle_bc_list ); - } - - inline collision_op_t * - define_collision_op( collision_op_t * cop ) { - return append_collision_op( cop, &collision_op_list ); - } - - //////////////////////// - // Miscellaneous helpers - - inline void abort( double code ) { - nanodelay(2000000000); mp_abort((((int)code)<<17)+1); - } - - // Truncate "a" to the nearest integer multiple of "b" - inline double trunc_granular( double a, double b ) { return b*int(a/b); } - - // Compute the remainder of a/b - inline double remainder( double a, double b ) { return std::remainder(a,b); } - // remainder(a,b); - - // Compute the Courant length on a regular mesh - inline double courant_length( double lx, double ly, double lz, - double nx, double ny, double nz ) { - double w0, w1 = 0; - if( nx>1 ) w0 = nx/lx, w1 += 
w0*w0; - if( ny>1 ) w0 = ny/ly, w1 += w0*w0; - if( nz>1 ) w0 = nz/lz, w1 += w0*w0; - return sqrt(1/w1); - } - - ////////////////////////////////////////////////////////// - // These friends are used by the checkpt / restore service - - friend void checkpt_vpic_simulation( const vpic_simulation * vpic ); - friend vpic_simulation * restore_vpic_simulation( void ); - friend void reanimate_vpic_simulation( vpic_simulation * vpic ); - - //////////////////////////////////////////////////////////// - // User input deck provided functions (see deck_wrapper.cxx) - - void user_initialization( int argc, char **argv ); - void user_particle_injection(void); - void user_current_injection(void); - void user_field_injection(void); - void user_diagnostics(void); - void user_particle_collisions(void); + void print_available_ram() + { + SystemRAM::print_available(); + } // print_available_ram + + /////////////// + // Dump helpers + + int dump_mkdir( const char* dname ); + int dump_cwd( char* dname, size_t size ); + + // Text dumps + void dump_energies( const char* fname, int append = 1 ); + void dump_materials( const char* fname ); + void dump_species( const char* fname ); + + // Binary dumps + void dump_grid( const char* fbase ); + void dump_fields( const char* fbase, int fname_tag = 1 ); + void dump_hydro( const char* sp_name, const char* fbase, + int fname_tag = 1 ); + void dump_particles( const char* sp_name, const char* fbase, + int fname_tag = 1 ); + + // convenience functions for simlog output + void create_field_list( char* strlist, DumpParameters& dumpParams ); + void create_hydro_list( char* strlist, DumpParameters& dumpParams ); + + void print_hashed_comment( FileIO& fileIO, const char* comment ); + void global_header( const char* base, + std::vector dumpParams ); + + void field_header( const char* fbase, DumpParameters& dumpParams ); + void hydro_header( const char* speciesname, const char* hbase, + DumpParameters& dumpParams ); + + void field_dump( DumpParameters& 
dumpParams ); + void hydro_dump( const char* speciesname, DumpParameters& dumpParams ); + + /////////////////// + // Useful accessors + + inline int rank() { return world_rank; } + + inline int nproc() { return world_size; } + + inline void barrier() { mp_barrier(); } + + inline double time() + { + return grid->t0 + (double)grid->dt * (double)grid->step; + } + + inline int64_t& step() { return grid->step; } + + inline field_t& field( const int v ) { return field_array->f[v]; } + + inline int voxel( const int ix, const int iy, const int iz ) + { + return ix + grid->sy * iy + grid->sz * iz; + } + + inline field_t& field( const int ix, const int iy, const int iz ) + { + return field_array->f[voxel( ix, iy, iz )]; + } + + inline interpolator_t& interpolator( const int v ) + { + return interpolator_array->i[v]; + } + + inline interpolator_t& interpolator( const int ix, const int iy, + const int iz ) + { + return interpolator_array->i[voxel( ix, iy, iz )]; + } + + inline hydro_t& hydro( const int v ) { return hydro_array->h[v]; } + + inline hydro_t& hydro( const int ix, const int iy, const int iz ) + { + return hydro_array->h[voxel( ix, iy, iz )]; + } + + inline rng_t* rng( const int n ) { return entropy->rng[n]; } + + inline rng_t* sync_rng( const int n ) { return sync_entropy->rng[n]; } + + /////////////// + // Grid helpers + + inline void define_units( float cvac, float eps0 ) + { + grid->cvac = cvac; + grid->eps0 = eps0; + } + + inline void define_timestep( float dt, double t0 = 0, int64_t step = 0 ) + { + grid->t0 = t0; + grid->dt = (float)dt; + grid->step = step; + } + + // The below functions automatically create partition simple grids with + // simple boundary conditions on the edges. 
+ + inline void define_periodic_grid( double xl, double yl, double zl, + double xh, double yh, double zh, + double gnx, double gny, double gnz, + double gpx, double gpy, double gpz ) + { + px = size_t( gpx ); + py = size_t( gpy ); + pz = size_t( gpz ); + partition_periodic_box( grid, xl, yl, zl, xh, yh, zh, (int)gnx, + (int)gny, (int)gnz, (int)gpx, (int)gpy, + (int)gpz ); + } + + inline void define_absorbing_grid( double xl, double yl, double zl, + double xh, double yh, double zh, + double gnx, double gny, double gnz, + double gpx, double gpy, double gpz, + int pbc ) + { + px = size_t( gpx ); + py = size_t( gpy ); + pz = size_t( gpz ); + partition_absorbing_box( grid, xl, yl, zl, xh, yh, zh, (int)gnx, + (int)gny, (int)gnz, (int)gpx, (int)gpy, + (int)gpz, pbc ); + } + + inline void define_reflecting_grid( double xl, double yl, double zl, + double xh, double yh, double zh, + double gnx, double gny, double gnz, + double gpx, double gpy, double gpz ) + { + px = size_t( gpx ); + py = size_t( gpy ); + pz = size_t( gpz ); + partition_metal_box( grid, xl, yl, zl, xh, yh, zh, (int)gnx, (int)gny, + (int)gnz, (int)gpx, (int)gpy, (int)gpz ); + } + + // The below macros allow custom domains to be created + + // Creates a particle reflecting metal box in the local domain + inline void size_domain( double lnx, double lny, double lnz ) + { + size_grid( grid, (int)lnx, (int)lny, (int)lnz ); + } + + // Attaches a local domain boundary to another domain + inline void join_domain( int boundary, double rank ) + { + join_grid( grid, boundary, (int)rank ); + } + + // Sets the field boundary condition of a local domain boundary + inline void set_domain_field_bc( int boundary, int fbc ) + { + set_fbc( grid, boundary, fbc ); + } + + // Sets the particle boundary condition of a local domain boundary + inline void set_domain_particle_bc( int boundary, int pbc ) + { + set_pbc( grid, boundary, pbc ); + } + + /////////////////// + // Material helpers + + inline material_t* define_material( const 
char* name, double eps,
+                                        double mu = 1, double sigma = 0,
+                                        double zeta = 0 )
+    {
+        return append_material( material( name, eps, eps, eps, mu, mu, mu,
+                                          sigma, sigma, sigma, zeta, zeta,
+                                          zeta ),
+                                &material_list );
+    }
+
+    inline material_t* define_material( const char* name, double epsx,
+                                        double epsy, double epsz, double mux,
+                                        double muy, double muz, double sigmax,
+                                        double sigmay, double sigmaz,
+                                        double zetax = 0, double zetay = 0,
+                                        double zetaz = 0 )
+    {
+        return append_material( material( name, epsx, epsy, epsz, mux, muy, muz,
+                                          sigmax, sigmay, sigmaz, zetax, zetay,
+                                          zetaz ),
+                                &material_list );
+    }
+
+    inline material_t* lookup_material( const char* name )
+    {
+        return find_material_name( name, material_list );
+    }
+
+    inline material_t* lookup_material( material_id id )
+    {
+        return find_material_id( id, material_list );
+    }
+
+    //////////////////////
+    // Field array helpers
+
+    // If fa is provided, define_field_array will use it (and take ownership
+    // of it). Otherwise the standard field array will be used with the
+    // optionally provided radiation damping parameter.
+
+    inline void define_field_array( field_array_t* fa = NULL, double damp = 0 )
+    {
+        int nx1 = grid->nx + 1, ny1 = grid->ny + 1, nz1 = grid->nz + 1;
+
+        if ( grid->nx < 1 || grid->ny < 1 || grid->nz < 1 )
+            ERROR( ( "Define your grid before defining the field array" ) );
+        if ( !material_list )
+            ERROR(
+                ( "Define your materials before defining the field array" ) );
+
+        field_array =
+            fa ? fa : new_standard_field_array( grid, material_list, damp );
+        interpolator_array = new_interpolator_array( grid );
+        accumulator_array = new_accumulator_array( grid );
+        hydro_array = new_hydro_array( grid );
+
+        // Pre-size communications buffers. 
This is done to get most memory
+        // allocation over with before the simulation starts running
+
+        mp_size_recv_buffer( grid->mp, BOUNDARY( -1, 0, 0 ),
+                             ny1 * nz1 * sizeof( hydro_t ) );
+        mp_size_recv_buffer( grid->mp, BOUNDARY( 1, 0, 0 ),
+                             ny1 * nz1 * sizeof( hydro_t ) );
+        mp_size_recv_buffer( grid->mp, BOUNDARY( 0, -1, 0 ),
+                             nz1 * nx1 * sizeof( hydro_t ) );
+        mp_size_recv_buffer( grid->mp, BOUNDARY( 0, 1, 0 ),
+                             nz1 * nx1 * sizeof( hydro_t ) );
+        mp_size_recv_buffer( grid->mp, BOUNDARY( 0, 0, -1 ),
+                             nx1 * ny1 * sizeof( hydro_t ) );
+        mp_size_recv_buffer( grid->mp, BOUNDARY( 0, 0, 1 ),
+                             nx1 * ny1 * sizeof( hydro_t ) );
+
+        mp_size_send_buffer( grid->mp, BOUNDARY( -1, 0, 0 ),
+                             ny1 * nz1 * sizeof( hydro_t ) );
+        mp_size_send_buffer( grid->mp, BOUNDARY( 1, 0, 0 ),
+                             ny1 * nz1 * sizeof( hydro_t ) );
+        mp_size_send_buffer( grid->mp, BOUNDARY( 0, -1, 0 ),
+                             nz1 * nx1 * sizeof( hydro_t ) );
+        mp_size_send_buffer( grid->mp, BOUNDARY( 0, 1, 0 ),
+                             nz1 * nx1 * sizeof( hydro_t ) );
+        mp_size_send_buffer( grid->mp, BOUNDARY( 0, 0, -1 ),
+                             nx1 * ny1 * sizeof( hydro_t ) );
+        mp_size_send_buffer( grid->mp, BOUNDARY( 0, 0, 1 ),
+                             nx1 * ny1 * sizeof( hydro_t ) );
+    }
+
+    // Other field helpers are provided by macros in deck_wrapper.cxx
+
+    //////////////////
+    // Species helpers
+
+    // FIXME: SILLY PROMOTIONS
+    inline species_t* define_species( const char* name, double q, double m,
+                                      double max_local_np, double max_local_nm,
+                                      double sort_interval,
+                                      double sort_out_of_place )
+    {
+        // Compute a reasonable number of movers if user did not specify
+        // Based on twice the number of particles expected to hit the
+        // boundary of a wpdt=0.2 / dx=lambda species in a 3x3x3 domain
+        if ( max_local_nm < 0 )
+        {
+            max_local_nm = 2 * max_local_np / 25;
+            if ( max_local_nm < 16 * ( MAX_PIPELINE + 1 ) )
+                max_local_nm = 16 * ( MAX_PIPELINE + 1 );
+        }
+        return append_species(
+            species( name, (float)q, (float)m, (size_t)max_local_np,
+                     (size_t)max_local_nm, (int)sort_interval,
+                     
(int)sort_out_of_place, grid ),
+            &species_list );
+    }
+
+    inline species_t* find_species( const char* name )
+    {
+        return find_species_name( name, species_list );
+    }
+
+    inline species_t* find_species( int32_t id )
+    {
+        return find_species_id( id, species_list );
+    }
+
+    ///////////////////
+    // Particle helpers
+
+    // Note: Don't use injection with aging during initialization
+
+    // Defaults in the declaration below enable backwards compatibility.
+
+    void inject_particle( species_t* sp, double x, double y, double z,
+                          double ux, double uy, double uz, double w,
+                          double age = 0, int update_rhob = 1 );
+
+    // Inject particle raw is for power users!
+    // No nannyism _at_ _all_:
+    // - Availability of free storage is _not_ checked.
+    // - Particle displacements and voxel index are _not_ checked for validity.
+    // - The rhob field is _not_ updated.
+    // - Injection with displacement may use up movers (i.e. don't use
+    //   injection with displacement during initialization).
+    // This injection is _ultra_ _fast_. 
+
+    inline void inject_particle_raw( species_t* RESTRICT sp, float dx, float dy,
+                                     float dz, int32_t i, float ux, float uy,
+                                     float uz, float w )
+    {
+        particle_t* RESTRICT p = sp->p + ( sp->np++ );
+        p->dx = dx;
+        p->dy = dy;
+        p->dz = dz;
+        p->i = i;
+        p->ux = ux;
+        p->uy = uy;
+        p->uz = uz;
+        p->w = w;
+    }
+
+    // This variant does a raw inject and moves the particles
+
+    inline void inject_particle_raw( species_t* RESTRICT sp, float dx, float dy,
+                                     float dz, int32_t i, float ux, float uy,
+                                     float uz, float w, float dispx,
+                                     float dispy, float dispz, int update_rhob )
+    {
+        particle_t* RESTRICT p = sp->p + ( sp->np++ );
+        particle_mover_t* RESTRICT pm = sp->pm + sp->nm;
+        p->dx = dx;
+        p->dy = dy;
+        p->dz = dz;
+        p->i = i;
+        p->ux = ux;
+        p->uy = uy;
+        p->uz = uz;
+        p->w = w;
+        pm->dispx = dispx;
+        pm->dispy = dispy;
+        pm->dispz = dispz;
+        pm->i = sp->np - 1;
+        if ( update_rhob )
+            accumulate_rhob( field_array->f, p, grid, -sp->q );
+        sp->nm += move_p( sp->p, pm, accumulator_array->a, grid, sp->q );
+    }
+
+    //////////////////////////////////
+    // Random number generator helpers
+
+    // seed_entropy seeds all the random number generators. The seed
+    // used for the individual generators is based off the user provided
+    // seed such that each local generator in each process (rng[0:r-1]) gets
+    // a unique seed. Each synchronous generator (sync_rng[0:r-1]) gets a
+    // unique seed that does not overlap with the local generators
+    // (common across each process). Lastly, all these seeds are such
+    // that no individual generator seeds are reused across different
+    // user seeds.
+    // FIXME: MTRAND DESPERATELY NEEDS A LARGER SEED SPACE!
+
+    inline void seed_entropy( int base )
+    {
+        seed_rng_pool( entropy, base, 0 );
+        seed_rng_pool( sync_entropy, base, 1 );
+    }
+
+    // Uniform random number on (low,high) (open interval)
+    // FIXME: IS THE INTERVAL STILL OPEN IN FINITE PRECISION
+    // AND IS THE OPEN INTERVAL REALLY WHAT USERS WANT?? 
+ inline double uniform( rng_t* rng, double low, double high )
+    {
+        double dx = drand( rng );
+        return low * ( 1 - dx ) + high * dx;
+    }
+
+    // Normal random number with mean mu and standard deviation sigma
+    inline double normal( rng_t* rng, double mu, double sigma )
+    {
+        return mu + sigma * drandn( rng );
+    }
+
+    /////////////////////////////////
+    // Emitter and particle bc helpers
+
+    // Note that append_emitter is hacked to silently return if the
+    // emitter is already in the list. This allows things like:
+    //
+    // define_surface_emitter( my_emitter( ... ), rgn )
+    // ... or ...
+    // my_emit_t * e = my_emit( ... )
+    // define_surface_emitter( e, rgn )
+    // ... or ...
+    // my_emit_t * e = define_emitter( my_emit( ... ) )
+    // define_surface_emitter( e, rng )
+    // ...
+    // All to work. (Nominally, would like define_surface_emitter
+    // to evaluate to the value of e. But, alas, the way
+    // define_surface_emitter works and language limitations of
+    // strict C++ prevent this.)
+
+    inline emitter_t* define_emitter( emitter_t* e )
+    {
+        return append_emitter( e, &emitter_list );
+    }
+
+    inline particle_bc_t* define_particle_bc( particle_bc_t* pbc )
+    {
+        return append_particle_bc( pbc, &particle_bc_list );
+    }
+
+    inline collision_op_t* define_collision_op( collision_op_t* cop )
+    {
+        return append_collision_op( cop, &collision_op_list );
+    }
+
+    ////////////////////////
+    // Miscellaneous helpers
+
+    inline void abort( double code )
+    {
+        nanodelay( 2000000000 );
+        mp_abort( ( ( (int)code ) << 17 ) + 1 );
+    }
+
+    // Truncate "a" to the nearest integer multiple of "b"
+    inline double trunc_granular( double a, double b )
+    {
+        return b * int( a / b );
+    }
+
+    // Compute the remainder of a/b
+    inline double remainder( double a, double b )
+    {
+        return std::remainder( a, b );
+    }
+    // remainder(a,b);
+
+    // Compute the Courant length on a regular mesh
+    inline double courant_length( double lx, double ly, double lz, double nx,
+                                  double ny, double nz )
+    {
+        double 
w0, w1 = 0; + if ( nx > 1 ) + w0 = nx / lx, w1 += w0 * w0; + if ( ny > 1 ) + w0 = ny / ly, w1 += w0 * w0; + if ( nz > 1 ) + w0 = nz / lz, w1 += w0 * w0; + return sqrt( 1 / w1 ); + } + + ////////////////////////////////////////////////////////// + // These friends are used by the checkpt / restore service + + friend void checkpt_vpic_simulation( const vpic_simulation* vpic ); + friend vpic_simulation* restore_vpic_simulation( void ); + friend void reanimate_vpic_simulation( vpic_simulation* vpic ); + + //////////////////////////////////////////////////////////// + // User input deck provided functions (see deck_wrapper.cxx) + + void user_initialization( int argc, char** argv ); + void user_particle_injection( void ); + void user_current_injection( void ); + void user_field_injection( void ); + void user_diagnostics( void ); + void user_particle_collisions( void ); }; #endif // vpic_h diff --git a/src/vpic/vpic_unit_deck.h b/src/vpic/vpic_unit_deck.h index 4ea4fdb0..e3af99df 100644 --- a/src/vpic/vpic_unit_deck.h +++ b/src/vpic/vpic_unit_deck.h @@ -3,8 +3,10 @@ #include "src/vpic/vpic.h" -void vpic_simulation::user_initialization(int num_cmdline_arguments, - char ** cmdline_arguments) {} +void vpic_simulation::user_initialization( int num_cmdline_arguments, + char** cmdline_arguments ) +{ +} void vpic_simulation::user_diagnostics() {}