lanl · JDTruj2018 · Dec 22, 2022 · Dec 19, 2022 · Dec 21, 2022
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ Prototype testbed for various memory acceleration schemes focused on improving s
 ```
 mkdir build
 cd build
-cmake ..
+CC=icx cmake ..
 make
 ```
 
@@ -22,6 +22,30 @@ Terminal window 2
 ./tests/test_client
 ```
 
+On nodes with multiple CPU sockets, bandwidth can be drastically reduced if the client and controller processes are bound to different sockets. To explicitly bind the processes to the same socket, use the following:
+
+Terminal window 1
+```
+hwloc-bind socket:0 ./controller
+```
+
+Terminal window 2
+```
+hwloc-bind socket:0 ./tests/test_client
+```
+
+
+## Tests
+Tests for 0, 1, and 2 levels of indirection are implemented. They come in the following flavors:
+- `str` uses straight access, meaning index `a[i] = i` for all levels of indirection (this is the only test available for 0 levels of indirection).
+- `A` or `noA` denotes if aliases are included or not. If aliases are included, they are added before the shuffle stage (see below). For each index, a random number is drawn and if it's below the alias fraction, this index is inserted at a random position in the indirection indices. This is done for all levels of indirection.
+- `F` or `C` denotes full or clustered shuffle and aliases. Full shuffle means the indices are shuffled across the entire range and aliases, if used, are inserted across the entire range. In clustered mode, the shuffle and aliasing happen only within consecutive clusters of the given size. For example, say we have a cluster size `S = 32`; then the first cluster is indices 0 - 31, aliases are added only within this group, and only these indices are shuffled amongst themselves. The next cluster is 32 - 63, and any aliases added to this cluster are all indices within this cluster, added before they are shuffled amongst themselves.
+
+Under the `tests` directory in the build directory, there are two executables:
+- `test` runs the test suite without using the client and controller infrastructure; it just tests the kernels directly
+- `test_client` runs the tests as a client and communicates with the controller; a controller must thus be running
+
+
 ## Roadmap
 Current Work:
 - [x] Request Ring Buffer
@@ -33,9 +57,9 @@ Current Work:
 - [x] Multi-threading
 - [x] Multi-client
 - [x] Integration with Spatter
+- [x] AVX intrinsics
 
 Future:
-- [ ] AVX intrinsics
 - [ ] SVE intrinsics
 - [ ] Read/Write Dependency Graphs
 - [ ] Run and Test scripts

diff --git a/include/shared/kernels.h b/include/shared/kernels.h
@@ -173,16 +173,24 @@ FORCE_INLINE void write_single_thread_0(double *buffer, const double *input,
 FORCE_INLINE void write_single_thread_1(double *buffer, const double *input,
                                         size_t N, const size_t *ind1,
                                         bool use_avx) {
-  for (size_t i = 0; i < N; ++i) {
-    buffer[ind1[i]] = input[i];
+  if (use_avx) {
+    WRITE_1_AVX(buffer, input, ind1, 0, N);
+  } else {
+    for (size_t i = 0; i < N; ++i) {
+      buffer[ind1[i]] = input[i];
+    }
   }
 }
 
 FORCE_INLINE void write_single_thread_2(double *buffer, const double *input,
                                         size_t N, const size_t *ind1,
                                         const size_t *ind2, bool use_avx) {
-  for (size_t i = 0; i < N; ++i) {
-    buffer[ind2[ind1[i]]] = input[i];
+  if (use_avx) {
+    WRITE_2_AVX(buffer, input, ind1, ind2, 0, N);
+  } else {
+    for (size_t i = 0; i < N; ++i) {
+      buffer[ind2[ind1[i]]] = input[i];
+    }
   }
 }
 
@@ -329,7 +337,8 @@ void *write_th_0_avx(void *args) {
 FORCE_INLINE void write_multi_thread_0(double *buffer, const double *input,
                                        size_t N, size_t n_threads,
                                        bool use_avx) {
-  THREAD_TEMPLATE(N, n_threads, write_th_args_0, write_th_0,
+  THREAD_TEMPLATE(N, n_threads, write_th_args_0,
+                  use_avx ? write_th_0_avx : write_th_0,
                   { args[i].input = input; });
 }
 
@@ -357,10 +366,11 @@ void *write_th_1_avx(void *args) {
 FORCE_INLINE
 void write_multi_thread_1(double *buffer, const double *input, size_t N,
                           const size_t *ind1, size_t n_threads, bool use_avx) {
-  THREAD_TEMPLATE(N, n_threads, write_th_args_1, write_th_1, {
-    args[i].input = input;
-    args[i].ind1 = ind1;
-  });
+  THREAD_TEMPLATE(N, n_threads, write_th_args_1,
+                  use_avx ? write_th_1_avx : write_th_1, {
+                    args[i].input = input;
+                    args[i].ind1 = ind1;
+                  });
 }
 
 struct write_th_args_2 {
@@ -388,9 +398,10 @@ FORCE_INLINE void write_multi_thread_2(double *buffer, const double *input,
                                        size_t N, const size_t *ind1,
                                        const size_t *ind2, size_t n_threads,
                                        bool use_avx) {
-  THREAD_TEMPLATE(N, n_threads, write_th_args_2, write_th_2, {
-    args[i].input = input;
-    args[i].ind1 = ind1;
-    args[i].ind2 = ind2;
-  });
+  THREAD_TEMPLATE(N, n_threads, write_th_args_2,
+                  use_avx ? write_th_2_avx : write_th_2, {
+                    args[i].input = input;
+                    args[i].ind1 = ind1;
+                    args[i].ind2 = ind2;
+                  });
 }
diff --git a/src/controller/controller_handle_requests.c b/src/controller/controller_handle_requests.c
@@ -204,6 +204,7 @@ void handle_requests(struct controller *controller) {
     args[i].controller = controller;
 
     int ret = pthread_create(&threads[i], NULL, handler, &args[i]);
+    (void)ret;
     assert(ret == 0);
   }