diff --git a/.gitignore b/.gitignore index c6127b3..fd12ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,18 @@ +# Custom +*~ +build*/ +debug*/ +scripts/ +compile_commands.json +.clangd/ +shm_malloc/*.o +shm_malloc/*.a +shm_malloc/tanon +shm_malloc/tshm1 +shm_malloc/tshm1db +shm_malloc/tshm2 +shm_malloc/tshm2db + # Prerequisites *.d diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..cbcc0ca --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/argtable3"] + path = external/argtable3 + url = https://github.com/argtable/argtable3.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a622983 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(src) + +add_compile_options(-Wall -Wextra -pedantic -Wno-unused-function -Wno-strict-prototypes -g) + +# JL: Can't use STREQUAL here because the compiler ID could be "IntelLLVM" +if (CMAKE_C_COMPILER_ID MATCHES "Intel") + add_compile_options(-mavx512f -Wno-debug-disables-optimization) +else() + add_compile_options(-g) +endif() + +set(CMAKE_C_STANDARD 11) + +include_directories(include) +include_directories(include/controller) +include_directories(include/client) +include_directories(include/posix) +include_directories(include/shared) +include_directories(include/uthash) + +add_executable(client client.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(client shm pthread rt) + +add_executable(controller controller.c ${CONTROLLER_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(controller shm pthread rt) + + +find_package(Git QUIET) +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") +# Update submodules as needed + option(GIT_SUBMODULE "Check submodules during build" ON) + if(GIT_SUBMODULE) + message(STATUS "Submodule update") + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update 
--init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") + endif() + endif() +endif() + +if(NOT EXISTS "${PROJECT_SOURCE_DIR}/external/argtable3/CMakeLists.txt") + message(FATAL_ERROR "The submodules were not downloaded! GIT_SUBMODULE was turned off or failed. Please update submodules and try again.") +endif() + +add_subdirectory(external/argtable3) +include_directories(external/argtable3/src) + +add_subdirectory(tests) diff --git a/README.md b/README.md new file mode 100644 index 0000000..84944d8 --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# ScorIA + +## Description +Prototype testbed for various memory acceleration schemes focused on improving sparse memory accesses and scatter/gather performance through indirection arrays. + +## Installation +``` +mkdir build +cd build +cmake .. 
+make
+```
+
+## Usage
+Terminal window 1
+```
+./controller
+```
+
+Terminal window 2
+```
+./tests/test_client
+```
+
+## Roadmap
+Current Work:
+- [x] Request Ring Buffer
+- [x] Controller Request Handler
+- [x] Develop Initial Test Cases
+- [x] Implement Read, Write
+- [x] Asynchronous Requests
+- [x] Atomic, Serialized, and Parallel Writes (Aliasing)
+- [x] Multi-threading
+- [x] Multi-client
+- [x] Integration with Spatter
+
+Future:
+- [ ] AVX intrinsics
+- [ ] SVE intrinsics
+- [ ] Read/Write Dependency Graphs
+- [ ] Run and Test scripts
+
+Initial Requirements:
+- [x] Multi-Process (Separation between client and server)
+- [x] Server is the memory controller
+- [x] Memory controller needs to be multi-threaded
+- [x] Needs to be able to use vector load store (for optimized bandwidth)
+- [x] Needs an API for taking memory access requests
+- [x] Needs to run on CPU hardware
+- [ ] Needs to be numa aware
+- [ ] Start prototyping on SPR (DDR)
+- [x] Define some test program for driving this
+ - [x] allocate A[], B[], C[]
+ - [x] fetch A[B[C[i]]] for i = 0:1000
+ - [x] D[1000]
+- [x] Determine what we will use for programming language (C)
+ - [x] FPGA friendly
+- [ ] Modular SGDMA
+- [x] Determine the shared memory programming paradigm
+- [x] Specialized memory allocator
+ - [x] Everything in shared memory
+ - [x] A, B, and C need to be allocated in shared memory (shared between the two process spaces)
+ - [x] These should probably be allocated at the same virtual address locations
+- [ ] Synchronous vs. 
Asynchronous + - [x] MPI_IReceive and MPI_Wait as inspiration + - [x] Handles of some sort + - [x] Could also just be an update of the latest fetched index (in a prefetching workload) in a well-known mailbox + +## Authors and acknowledgment +- Jered Dominguez-Trujillo jereddt@lanl.gov +- Jonas Lippuner jlippuner@lanl.gov diff --git a/client.c b/client.c new file mode 100644 index 0000000..57bf6b2 --- /dev/null +++ b/client.c @@ -0,0 +1,26 @@ +#include + +#include "client.h" +#include "config.h" + +#include "client_cleanup.h" +#include "client_init.h" +#include "client_place_requests.h" + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + struct client client; + client.chatty = 0; + + init(&client); + place_requests(&client); + + cleanup(&client); + + printf("Exiting...\n"); + + return 0; +} diff --git a/controller.c b/controller.c new file mode 100644 index 0000000..a706d06 --- /dev/null +++ b/controller.c @@ -0,0 +1,26 @@ +#include + +#include "config.h" +#include "controller.h" + +#include "controller_cleanup.h" +#include "controller_handle_requests.h" +#include "controller_init.h" + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + struct controller controller; + controller.chatty = 1; + + init(&controller); + handle_requests(&controller); + + cleanup(&controller); + + printf("Exiting...\n"); + + return 0; +} diff --git a/external/argtable3 b/external/argtable3 new file mode 160000 index 0000000..f46ef20 --- /dev/null +++ b/external/argtable3 @@ -0,0 +1 @@ +Subproject commit f46ef208a7ebc941c9f56027112b6894baa4c3f9 diff --git a/include/client.h b/include/client.h new file mode 100644 index 0000000..2a64bac --- /dev/null +++ b/include/client.h @@ -0,0 +1,30 @@ +#ifndef CLIENT_H +#define CLIENT_H + +#include + +#include "config.h" +#include "request.h" + +struct client { + int id; + + int fd_location; + int fd_requests; + int fd_completions; + + int chatty; + + struct 
memory_location *shared_location; + + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; + struct shared_memory *shared_mem_ptr; + + struct request_queue *shared_requests; + struct request_queue *shared_completions; + + struct request *unmatched_requests; +}; + +#endif /* CLIENT_H */ diff --git a/include/client/client_cleanup.h b/include/client/client_cleanup.h new file mode 100644 index 0000000..ac57abb --- /dev/null +++ b/include/client/client_cleanup.h @@ -0,0 +1,10 @@ +#ifndef CLIENT_CLEANUP +#define CLIENT_CLEANUP + +#include "client.h" + +void cleanup_queues(struct client *client); +void cleanup_shared_mem(struct client *client); +void cleanup(struct client *client); + +#endif /* CLIENT_CLEANUP */ diff --git a/include/client/client_init.h b/include/client/client_init.h new file mode 100644 index 0000000..d67ca6b --- /dev/null +++ b/include/client/client_init.h @@ -0,0 +1,13 @@ +#ifndef CLIENT_INIT +#define CLIENT_INIT + +#include "client.h" +#include "request.h" + +void init_memory_pool(struct client *client); +void init_requests(struct client *client); +void init_completions(struct client *client); +void init_id(struct client *client); +void init(struct client *client); + +#endif /* CLIENT_INIT */ diff --git a/include/client/client_memory.h b/include/client/client_memory.h new file mode 100644 index 0000000..898863b --- /dev/null +++ b/include/client/client_memory.h @@ -0,0 +1,19 @@ +#ifndef CLIENT_MEMORY_H +#define CLIENT_MEMORY_H + +#include + +#include "client.h" +#include "request.h" + +void scoria_put_request(struct client *client, struct request *req); + +void scoria_quit(struct client *client, struct request *req); +void scoria_read(struct client *client, void *buffer, const size_t N, + void *output, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req); +void scoria_write(struct client *client, void *buffer, const size_t N, + void *input, const size_t 
*ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req); + +#endif /* CLIENT_MEMORY_H */ diff --git a/include/client/client_place_requests.h b/include/client/client_place_requests.h new file mode 100644 index 0000000..1804b1f --- /dev/null +++ b/include/client/client_place_requests.h @@ -0,0 +1,12 @@ +#ifndef CLIENT_PLACE_REQUESTS +#define CLIENT_PLACE_REQUESTS + +#include "client.h" +#include "request.h" + +void wait_request(struct client *client, struct request *req); +void wait_requests(struct client *client, struct request *reqs, + size_t num_reqs); +void place_requests(struct client *client); + +#endif /* CLIENT_PLACE_REQUESTS */ diff --git a/include/client/client_read_location.h b/include/client/client_read_location.h new file mode 100644 index 0000000..ce7c943 --- /dev/null +++ b/include/client/client_read_location.h @@ -0,0 +1,8 @@ +#ifndef CLIENT_READ_LOCATION +#define CLIENT_READ_LOCATION + +#include "client.h" + +void read_location(struct client *client); + +#endif /* CLIENT_READ_LOCATION */ diff --git a/include/config.h b/include/config.h new file mode 100644 index 0000000..0f1a67f --- /dev/null +++ b/include/config.h @@ -0,0 +1,30 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define SHARED_MEMORY_NAME "shared-mem" +#define SHARED_LOCATION_NAME "/mem-controller-location" +#define SHARED_REQUESTS_NAME "/mem-request-queue" +#define SHARED_COMPLETIONS_NAME "/mem-completion-queue" + +#define REQUEST_QUEUE_SIZE 100 +#define MAX_CLIENTS 64 + +struct list { + struct list *next; + int data; +}; + +struct shared_memory { + struct list *head; + struct list **tail; +}; + +struct memory_location { + int ready; + + struct shared_memory *shared_mem_ptr; + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; +}; + +#endif /* CONFIG_H */ diff --git a/include/controller.h b/include/controller.h new file mode 100644 index 0000000..ada4ccc --- /dev/null +++ b/include/controller.h @@ -0,0 +1,23 @@ 
+#ifndef CONTROLLER_H +#define CONTROLLER_H + +#include + +#include "config.h" +#include "request.h" + +struct controller { + int fd_location; + int fd_requests; + int fd_completions; + + int chatty; + + struct memory_location *shared_location; + + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; + struct shared_memory *shared_mem_ptr; +}; + +#endif /* CONTROLLER_H */ diff --git a/include/controller/controller_cleanup.h b/include/controller/controller_cleanup.h new file mode 100644 index 0000000..b98eb61 --- /dev/null +++ b/include/controller/controller_cleanup.h @@ -0,0 +1,9 @@ +#ifndef CONTROLLER_CLEANUP +#define CONTROLLER_CLEANUP + +#include "controller.h" + +void cleanup_shared_mem(struct controller *controller); +void cleanup(struct controller *controller); + +#endif /* CONTROLLER_CLEANUP */ diff --git a/include/controller/controller_handle_requests.h b/include/controller/controller_handle_requests.h new file mode 100644 index 0000000..5cc82ad --- /dev/null +++ b/include/controller/controller_handle_requests.h @@ -0,0 +1,20 @@ +#ifndef CONTROLLER_HANDLE_REQUESTS +#define CONTROLLER_HANDLE_REQUESTS + +#include "controller.h" +#include "request.h" + +struct thread_args { + size_t i; + struct controller *controller; +}; + +void handle_read(struct controller *controller, struct request_queue *queue, + struct request *req); +void handle_write(struct controller *controller, struct request_queue *queue, + struct request *req); + +void *handler(void *args); +void handle_requests(struct controller *controller); + +#endif /* CONTROLLER_HANDLE_REQUESTS */ diff --git a/include/controller/controller_init.h b/include/controller/controller_init.h new file mode 100644 index 0000000..9776f01 --- /dev/null +++ b/include/controller/controller_init.h @@ -0,0 +1,13 @@ +#ifndef CONTROLLER_INIT +#define CONTROLLER_INIT + +#include "controller.h" +#include "request.h" + +void init_files(); +void init_memory_pool(struct controller 
*controller); +void init_requests(struct controller *controller); +void init_completions(struct controller *controller); +void init(struct controller *controller); + +#endif /* CONTROLLER_INIT */ diff --git a/include/controller/controller_write_location.h b/include/controller/controller_write_location.h new file mode 100644 index 0000000..cb237a3 --- /dev/null +++ b/include/controller/controller_write_location.h @@ -0,0 +1,8 @@ +#ifndef CONTROLLER_WRITE_LOCATION +#define CONTROLLER_WRITE_LOCATION + +#include "controller.h" + +void write_location(struct controller *controller); + +#endif /* CONTROLLER_WRITE_LOCATION */ diff --git a/include/posix/posix_sm.h b/include/posix/posix_sm.h new file mode 100644 index 0000000..cde60fb --- /dev/null +++ b/include/posix/posix_sm.h @@ -0,0 +1,19 @@ +#ifndef POSIX_SM_H +#define POSIX_SM_H + +#include +#include + +int scoria_sm_open(const char *name, int oflag, mode_t mode, const char *msg); + +void scoria_sm_unlink(const char *name, const char *msg); + +void scoria_sm_truncate(const int fd, const size_t length, const char *msg); + +void *scoria_sm_map(void *addr, const size_t length, const int prot, + const int flags, const int fd, const off_t offset, + const char *msg); + +void scoria_sm_unmap(void *ptr, const size_t length, const char *msg); + +#endif /* POSIX_SM_H */ diff --git a/include/shared/backend-support-tests.h b/include/shared/backend-support-tests.h new file mode 100644 index 0000000..aaa1467 --- /dev/null +++ b/include/shared/backend-support-tests.h @@ -0,0 +1,49 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. + +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. 
Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ +#ifndef BACKEND_SUPPORT_TESTS_H +#define BACKEND_SUPPORT_TESTS_H +int sg_cuda_support(); +int sg_opencl_support(); +int sg_openmp_support(); +int sg_serial_support(); +#endif diff --git a/include/shared/json.h b/include/shared/json.h new file mode 100644 index 0000000..25628c3 --- /dev/null +++ b/include/shared/json.h @@ -0,0 +1,243 @@ + +/* vim: set et ts=3 sw=3 sts=3 ft=c: + * + * Copyright (C) 2012, 2013, 2014 James McLaughlin et al. All rights reserved. + * https://github.com/udp/json-parser + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _JSON_H +#define _JSON_H + +#ifndef json_char +#define json_char char +#endif + +#ifndef json_int_t +#ifndef _MSC_VER +#include +#define json_int_t int64_t +#else +#define json_int_t __int64 +#endif +#endif + +#include + +#ifdef __cplusplus + +#include + +extern "C" { + +#endif + +typedef struct { + unsigned long max_memory; + int settings; + + /* Custom allocator support (leave null to use malloc/free) + */ + + void *(*mem_alloc)(size_t, int zero, void *user_data); + void (*mem_free)(void *, void *user_data); + + void *user_data; /* will be passed to mem_alloc and mem_free */ + + size_t value_extra; /* how much extra space to allocate for values? */ + +} json_settings; + +#define json_enable_comments 0x01 + +typedef enum { + json_none, + json_object, + json_array, + json_integer, + json_double, + json_string, + json_boolean, + json_null + +} json_type; + +extern const struct _json_value json_value_none; + +typedef struct _json_object_entry { + json_char *name; + unsigned int name_length; + + struct _json_value *value; + +} json_object_entry; + +typedef struct _json_value { + struct _json_value *parent; + + json_type type; + + union { + int boolean; + json_int_t integer; + double dbl; + + struct { + unsigned int length; + json_char *ptr; /* null terminated */ + + } string; + + struct { + unsigned int length; + + json_object_entry *values; + +#if defined(__cplusplus) && __cplusplus >= 201103L + decltype(values) begin() const { return values; } + decltype(values) end() const { return values + length; } +#endif + + } object; + + struct { + unsigned int length; + struct _json_value **values; + +#if defined(__cplusplus) && __cplusplus >= 201103L + decltype(values) begin() const { return values; } + decltype(values) end() const { return values + length; } +#endif + + } array; + + } u; + + union { + struct _json_value *next_alloc; + void *object_mem; + + } _reserved; + +#ifdef JSON_TRACK_SOURCE + + /* Location of the value in the source JSON + */ + 
unsigned int line, col; + +#endif + + /* Some C++ operator sugar */ + +#ifdef __cplusplus + +public: + inline _json_value() { memset(this, 0, sizeof(_json_value)); } + + inline const struct _json_value &operator[](int index) const { + if (type != json_array || index < 0 || + ((unsigned int)index) >= u.array.length) { + return json_value_none; + } + + return *u.array.values[index]; + } + + inline const struct _json_value &operator[](const char *index) const { + if (type != json_object) + return json_value_none; + + for (unsigned int i = 0; i < u.object.length; ++i) + if (!strcmp(u.object.values[i].name, index)) + return *u.object.values[i].value; + + return json_value_none; + } + + inline operator const char *() const { + switch (type) { + case json_string: + return u.string.ptr; + + default: + return ""; + }; + } + + inline operator json_int_t() const { + switch (type) { + case json_integer: + return u.integer; + + case json_double: + return (json_int_t)u.dbl; + + default: + return 0; + }; + } + + inline operator bool() const { + if (type != json_boolean) + return false; + + return u.boolean != 0; + } + + inline operator double() const { + switch (type) { + case json_integer: + return (double)u.integer; + + case json_double: + return u.dbl; + + default: + return 0; + }; + } + +#endif + +} json_value; + +json_value *json_parse(const json_char *json, size_t length); + +#define json_error_max 128 +json_value *json_parse_ex(json_settings *settings, const json_char *json, + size_t length, char *error); + +void json_value_free(json_value *); + +/* Not usually necessary, unless you used a custom mem_alloc and now want to + * use a custom mem_free. 
+ */ +void json_value_free_ex(json_settings *settings, json_value *); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/include/shared/kernels.h b/include/shared/kernels.h new file mode 100644 index 0000000..e3944a5 --- /dev/null +++ b/include/shared/kernels.h @@ -0,0 +1,396 @@ +#include +#include +#include +#include +#include + +#include + +static_assert(sizeof(size_t) == 8, "size_t is expected to be a 64-bit integer"); + +#define FORCE_INLINE __attribute__((always_inline)) static inline + +// =========================================================================== +// AVX KERNELS +// =========================================================================== + +#define READ_0_AVX(buffer, res, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + vals = _mm512_load_pd(buffer + idx); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[idx]; \ + } + +#define READ_1_AVX(buffer, res, ind, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices = _mm512_load_epi64(ind + idx); \ + vals = _mm512_i64gather_pd(indices, buffer, 8); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[ind[idx]]; \ + } + +#define READ_2_AVX(buffer, res, ind1, ind2, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices1, indices2; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices1 = _mm512_load_epi64(ind1 + idx); \ + 
indices2 = _mm512_i64gather_epi64(indices1, ind2, 8); \ + vals = _mm512_i64gather_pd(indices2, buffer, 8); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[ind2[ind1[idx]]]; \ + } + +#define WRITE_0_AVX(buffer, input, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + vals = _mm512_load_pd(input + idx); \ + _mm512_store_pd(buffer + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[idx] = input[idx]; \ + } + +#define WRITE_1_AVX(buffer, input, ind, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices = _mm512_load_epi64(ind + idx); \ + vals = _mm512_load_pd(input + idx); \ + _mm512_i64scatter_pd(buffer, indices, vals, 8); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[ind[idx]] = input[idx]; \ + } + +#define WRITE_2_AVX(buffer, input, ind1, ind2, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices1, indices2; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices1 = _mm512_load_epi64(ind1 + idx); \ + indices2 = _mm512_i64gather_epi64(indices1, ind2, 8); \ + vals = _mm512_load_pd(input + idx); \ + _mm512_i64scatter_pd(buffer, indices2, vals, 8); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[ind2[ind1[idx]]] = input[idx]; \ + } + +// =========================================================================== +// SINGLE THREADED +// 
=========================================================================== + +FORCE_INLINE void read_single_thread_0(double *res, const double *buffer, + size_t N, bool use_avx) { + if (use_avx) { + READ_0_AVX(buffer, res, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[i]; + } + } +} + +FORCE_INLINE void read_single_thread_1(double *res, const double *buffer, + size_t N, const size_t *ind1, + bool use_avx) { + if (use_avx) { + READ_1_AVX(buffer, res, ind1, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[ind1[i]]; + } + } +} + +FORCE_INLINE void read_single_thread_2(double *res, const double *buffer, + size_t N, const size_t *ind1, + const size_t *ind2, bool use_avx) { + if (use_avx) { + READ_2_AVX(buffer, res, ind1, ind2, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[ind2[ind1[i]]]; + } + } +} + +FORCE_INLINE void write_single_thread_0(double *buffer, const double *input, + size_t N, bool use_avx) { + if (use_avx) { + WRITE_0_AVX(buffer, input, 0, N); + } else { + for (size_t i = 0; i < N; ++i) { + buffer[i] = input[i]; + } + } +} + +FORCE_INLINE void write_single_thread_1(double *buffer, const double *input, + size_t N, const size_t *ind1, + bool use_avx) { + for (size_t i = 0; i < N; ++i) { + buffer[ind1[i]] = input[i]; + } +} + +FORCE_INLINE void write_single_thread_2(double *buffer, const double *input, + size_t N, const size_t *ind1, + const size_t *ind2, bool use_avx) { + for (size_t i = 0; i < N; ++i) { + buffer[ind2[ind1[i]]] = input[i]; + } +} + +// =========================================================================== +// MULTI THREADED +// =========================================================================== + +#define MIN(x, y) (x < y ? x : y) +#define MAX(x, y) (x > y ? 
x : y) + +#define THREAD_TEMPLATE(N, n_threads, thread_args, thread_func, \ + extra_args_setup) \ + size_t chunk_size = (N + n_threads - 1) / n_threads; /* round up */ \ + \ + pthread_t threads[n_threads]; \ + struct thread_args args[n_threads]; \ + \ + for (size_t i = 0; i < n_threads; ++i) { \ + args[i].buffer = buffer; \ + args[i].start = i * chunk_size; \ + args[i].end = MIN((i + 1) * chunk_size, N); \ + extra_args_setup; \ + \ + int ret = pthread_create(&threads[i], NULL, thread_func, &args[i]); \ + (void)ret; \ + assert(ret == 0); \ + } \ + \ + for (size_t i = 0; i < n_threads; ++i) { \ + pthread_join(threads[i], NULL); \ + } + +struct read_th_args_0 { + double *res; + const double *buffer; + size_t start, end; +}; + +void *read_th_0(void *args) { + struct read_th_args_0 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[i]; + } + return NULL; +} + +void *read_th_0_avx(void *args) { + struct read_th_args_0 *a = args; + READ_0_AVX(a->buffer, a->res, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_0(double *res, const double *buffer, + size_t N, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_0, + use_avx ? read_th_0_avx : read_th_0, { args[i].res = res; }); +} + +struct read_th_args_1 { + double *res; + const double *buffer; + size_t start, end; + const size_t *ind1; +}; + +void *read_th_1(void *args) { + struct read_th_args_1 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[a->ind1[i]]; + } + return NULL; +} + +void *read_th_1_avx(void *args) { + struct read_th_args_1 *a = args; + READ_1_AVX(a->buffer, a->res, a->ind1, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_1(double *res, const double *buffer, + size_t N, const size_t *ind1, + size_t n_threads, bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_1, + use_avx ? 
read_th_1_avx : read_th_1, { + args[i].res = res; + args[i].ind1 = ind1; + }); +} + +struct read_th_args_2 { + double *res; + const double *buffer; + size_t start, end; + const size_t *ind1, *ind2; +}; + +void *read_th_2(void *args) { + struct read_th_args_2 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[a->ind2[a->ind1[i]]]; + } + return NULL; +} + +void *read_th_2_avx(void *args) { + struct read_th_args_2 *a = args; + READ_2_AVX(a->buffer, a->res, a->ind1, a->ind2, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_2(double *res, const double *buffer, + size_t N, const size_t *ind1, + const size_t *ind2, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_2, + use_avx ? read_th_2_avx : read_th_2, { + args[i].res = res; + args[i].ind1 = ind1; + args[i].ind2 = ind2; + }); +} + +struct write_th_args_0 { + double *buffer; + const double *input; + size_t start, end; +}; + +void *write_th_0(void *args) { + struct write_th_args_0 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[i] = a->input[i]; + } + return NULL; +} + +void *write_th_0_avx(void *args) { + struct write_th_args_0 *a = args; + WRITE_0_AVX(a->buffer, a->input, a->start, a->end) + return NULL; +} + +FORCE_INLINE void write_multi_thread_0(double *buffer, const double *input, + size_t N, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_0, write_th_0, + { args[i].input = input; }); +} + +struct write_th_args_1 { + double *buffer; + const double *input; + size_t start, end; + const size_t *ind1; +}; + +void *write_th_1(void *args) { + struct write_th_args_1 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[a->ind1[i]] = a->input[i]; + } + return NULL; +} + +void *write_th_1_avx(void *args) { + struct write_th_args_1 *a = args; + WRITE_1_AVX(a->buffer, a->input, a->ind1, a->start, a->end) + return NULL; +} + +FORCE_INLINE +void 
write_multi_thread_1(double *buffer, const double *input, size_t N, + const size_t *ind1, size_t n_threads, bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_1, write_th_1, { + args[i].input = input; + args[i].ind1 = ind1; + }); +} + +struct write_th_args_2 { + double *buffer; + const double *input; + size_t start, end; + const size_t *ind1, *ind2; +}; + +void *write_th_2(void *args) { + struct write_th_args_2 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[a->ind2[a->ind1[i]]] = a->input[i]; + } + return NULL; +} + +void *write_th_2_avx(void *args) { + struct write_th_args_2 *a = args; + WRITE_2_AVX(a->buffer, a->input, a->ind1, a->ind2, a->start, a->end) + return NULL; +} + +FORCE_INLINE void write_multi_thread_2(double *buffer, const double *input, + size_t N, const size_t *ind1, + const size_t *ind2, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_2, write_th_2, { + args[i].input = input; + args[i].ind1 = ind1; + args[i].ind2 = ind2; + }); +} diff --git a/include/shared/parse-args.h b/include/shared/parse-args.h new file mode 100644 index 0000000..3eb6864 --- /dev/null +++ b/include/shared/parse-args.h @@ -0,0 +1,167 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +/** @file parse-args.h + * @author Patrick Lavin + * @brief Provides a function to read CLI + */ + +#ifndef PARSE_ARGS_H +#define PARSE_ARGS_H + +#define WARN 0 +#define ERROR 1 + +#define STRING_SIZE 1000000 +#define MAX_PATTERN_LEN 1048576 + +#include +#include +#include + +/** @brief Supported benchmark backends + */ +enum sg_backend { + OPENCL, /**< OpenCL Backend */ + OPENMP, /**< OpenMP CPU Backend */ + CUDA, /**< CUDA Backend */ + SERIAL, /**< SERIAL Backend */ + INVALID_BACKEND /**< Used as a default backend */ +}; + +enum sg_kernel { + INVALID_KERNEL = 0, + SCATTER, + GATHER, + GS, +}; + +enum sg_op { OP_COPY, OP_ACCUM, INVALID_OP }; + +// Specifies the indexing or offset type +enum idx_type { + UNIFORM, + MS1, + LAPLACIAN, + CUSTOM, + CONFIG_FILE, + XKP, + INVALID_IDX +}; + +/* +enum state +{ + NOTRUN, + INVALID_STATE, + VALID_STATE +}; +*/ + +struct run_config { + // keep arrays at top so they are aligned + spIdx_t *pattern; + spIdx_t *pattern_gather; + spIdx_t *pattern_scatter; + size_t *deltas; + size_t *deltas_ps; + size_t *deltas_gather; + size_t *deltas_gather_ps; + size_t *deltas_scatter; + size_t *deltas_scatter_ps; + spSize_t pattern_len; + spSize_t pattern_gather_len; + spSize_t pattern_scatter_len; + ssize_t delta; + size_t deltas_len; + ssize_t delta_gather; + size_t deltas_gather_len; + ssize_t delta_scatter; + size_t deltas_scatter_len; + enum sg_kernel kernel; + enum idx_type type; + enum idx_type type_gather; + enum idx_type type_scatter; + spSize_t generic_len; + size_t wrap; + size_t nruns; + char pattern_file[STRING_SIZE]; + char generator[STRING_SIZE]; + char name[STRING_SIZE]; + size_t random_seed; + size_t omp_threads; + enum sg_op op; + size_t vector_len; + unsigned int shmem; + size_t local_work_size; + double *time_ms; + long long **papi_ctr; + int papi_counters; + int stride_kernel; + // Reorder based kernels + int ro_morton; + int ro_hilbert; + int ro_block; + uint32_t *ro_order; + uint32_t *ro_order_dev; +}; + +struct 
backend_config {
+  enum sg_backend backend;
+  enum sg_kernel kernel;
+  enum sg_op op;
+
+  char platform_string[STRING_SIZE];
+  char device_string[STRING_SIZE];
+  char kernel_file[STRING_SIZE];
+  char kernel_name[STRING_SIZE];
+};
+
+/** @brief Read command-line arguments and populate global variables.
+ * @param argc Value passed to main
+ * @param argv Value passed to main
+ */
+void parse_args(int argc, char **argv, int *nrc, struct run_config **rc);
+struct run_config *parse_runs(int argc, char **argv);
+void error(char *what, int code);
+void print_run_config(struct run_config rc);
+#endif
diff --git a/include/shared/pcg_basic.h b/include/shared/pcg_basic.h
new file mode 100644
index 0000000..6a47067
--- /dev/null
+++ b/include/shared/pcg_basic.h
@@ -0,0 +1,78 @@
+/*
+ * PCG Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This code is derived from the full C implementation, which is in turn
+ * derived from the canonical C++ PCG implementation. The C++ version
+ * has many additional features and is preferable if you can use C++ in
+ * your project.
+ */ + +#ifndef PCG_BASIC_H_INCLUDED +#define PCG_BASIC_H_INCLUDED 1 + +#include + +#if __cplusplus +extern "C" { +#endif + +struct pcg_state_setseq_64 { // Internals are *Private*. + uint64_t state; // RNG state. All values are possible. + uint64_t inc; // Controls which RNG sequence (stream) is + // selected. Must *always* be odd. +}; +typedef struct pcg_state_setseq_64 pcg32_random_t; + +// If you *must* statically initialize it, here's one. + +#define PCG32_INITIALIZER \ + { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. stream id) + +void pcg32_srandom(uint64_t initstate, uint64_t initseq); +void pcg32_srandom_r(pcg32_random_t *rng, uint64_t initstate, uint64_t initseq); + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random(void); +uint32_t pcg32_random_r(pcg32_random_t *rng); + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand(uint32_t bound); +uint32_t pcg32_boundedrand_r(pcg32_random_t *rng, uint32_t bound); + +#if __cplusplus +} +#endif + +#endif // PCG_BASIC_H_INCLUDED diff --git a/include/shared/request.h b/include/shared/request.h new file mode 100644 index 0000000..1deab0e --- /dev/null +++ b/include/shared/request.h @@ -0,0 +1,79 @@ +#ifndef REQUEST_H +#define REQUEST_H + +#include "uthash.h" + +#include "config.h" + +#include +#include +#include + +typedef enum { Read, Write, Quit, Kill } request_type; +typedef enum { Waiting, Ready } request_status; + +struct request { + int client; + int id; + + request_type r_type; + request_status r_status; + size_t size; + + size_t N; + + const void *input; + void *output; + + const size_t *ind1; + const size_t *ind2; + size_t nthreads; 
+ bool use_avx; + + size_t offset; + double value; + + UT_hash_handle hh; +}; + +typedef struct request_queue { + int client; + int active; + + struct request requests[REQUEST_QUEUE_SIZE]; + + struct request *head; + struct request *tail; + + size_t capacity; + size_t count; + size_t size; + + struct request *begin; + struct request *end; + + pthread_mutexattr_t attr_lock; + pthread_condattr_t attr_empty; + pthread_condattr_t attr_fill; + + pthread_mutex_t lock; + pthread_cond_t empty, fill; +} request_queue; + +void request_queue_init(request_queue *rq); +void request_queue_free(request_queue *rq); + +void request_queue_put(request_queue *rq, const struct request *item); +void request_queue_fetch(request_queue *rq, struct request *item); + +void request_queue_activate(request_queue *rq, int id); +void request_queue_deactivate(request_queue *rq); + +typedef struct request_queue_list { + struct request_queue queues[MAX_CLIENTS]; +} request_queue_list; + +void request_queue_list_init(request_queue_list *rql); +void request_queue_list_free(request_queue_list *rql); + +#endif /* REQUEST_H */ diff --git a/include/shared/sgtype.h b/include/shared/sgtype.h new file mode 100644 index 0000000..1459d19 --- /dev/null +++ b/include/shared/sgtype.h @@ -0,0 +1,80 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. 
The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +/** @file sgtype.h + * @author Patrick Lavin + * @brief A simple include file which can be edited to change the data type + */ + +#ifndef SGTYPE_H +#define SGTYPE_H +#include +#include + +#ifdef USE_OPENCL +#include "cl-helper.h" +static_assert(sizeof(cl_ulong) == sizeof(unsigned long), + "Due to size differences between cl_ulong and unsigned long, we " + "cannot compile with OpenCL support on your system"); +static_assert(sizeof(cl_double) == sizeof(double), + "Due to size differences between cl_double and double, we cannot " + "compile with OpenCL support on your system"); +static_assert(sizeof(cl_uint) == sizeof(unsigned int), + "Due to size differences between cl_uint and unsigned int, we " + "cannot compile with OpenCL support on your system"); +static_assert(sizeof(cl_float) == sizeof(float), + "Due to size differences between cl_double and double, we cannot " + "compile with OpenCL support on your system"); +#endif + +typedef double sgData_t; +#define SGD "%lf" +typedef unsigned long sgIdx_t; +typedef unsigned long spIdx_t; +#define SGI "%lu" +typedef long sgsIdx_t; +#define SGS "%ld" + +typedef size_t spSize_t; +#define SPS "%zu" + +#endif // endif SGTYPE diff --git a/include/shared/sp_alloc.h b/include/shared/sp_alloc.h new file mode 100644 index 0000000..4af6090 --- /dev/null +++ b/include/shared/sp_alloc.h @@ -0,0 +1,57 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. 
The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/
+
+#ifndef SP_ALLOC_H
+#define SP_ALLOC_H
+
+#include <stddef.h>
+
+#ifndef SP_MAX_ALLOC
+// 65GB
+#define SP_MAX_ALLOC (65ll * 1000 * 1000 * 1000)
+#endif
+#define ALIGN_CACHE 64
+#define ALIGN_PAGE 4096
+void *sp_malloc(size_t size, size_t count, size_t align);
+void *sp_calloc(size_t size, size_t count, size_t align);
+long long get_mem_used();
+#endif
diff --git a/include/shared/utils.h b/include/shared/utils.h
new file mode 100644
index 0000000..cbdf085
--- /dev/null
+++ b/include/shared/utils.h
@@ -0,0 +1,7 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+void setup();
+void scoria_error(const char *msg);
+
+#endif /* UTILS_H */
diff --git a/include/uthash/uthash.h b/include/uthash/uthash.h
new file mode 100644
index 0000000..83ead39
--- /dev/null
+++ b/include/uthash/uthash.h
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (c) 2003-2022, Troy D. Hanson https://troydhanson.github.io/uthash/
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * */ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.3.0 + +#include /* ptrdiff_t */ +#include /* exit */ +#include /* memcmp, memset, strlen */ + +#if defined(HASH_DEFINE_OWN_STDINT) && HASH_DEFINE_OWN_STDINT +/* This codepath is provided for backward compatibility, but I plan to remove + * it. */ +#warning \ + "HASH_DEFINE_OWN_STDINT is deprecated; please use HASH_NO_STDINT instead" +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#elif defined(HASH_NO_STDINT) && HASH_NO_STDINT +#else +#include /* uint8_t, uint32_t */ +#endif + +/* These macros use decltype or the earlier __typeof GNU extension. + * As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + * when compiling c++ source) this code uses whatever method is needed + * or, for VS2008 where neither is available, uses casting workarounds. + */ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__MCST__) /* Elbrus C Compiler */ +#define DECLTYPE(x) (__typeof(x)) +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || \ + defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst, src) \ + do { \ + char **_da_dst = (char **)(&(dst)); \ + *_da_dst = (char *)(src); \ + } while (0) +#else +#define DECLTYPE_ASSIGN(dst, src) \ + do { \ + (dst) = DECLTYPE(dst)(src); \ + } while (0) +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr, sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a, n) memset(a, '\0', n) +#endif +#ifndef uthash_strlen 
+#define uthash_strlen(s) strlen(s) +#endif + +#ifndef HASH_FUNCTION +#define HASH_FUNCTION(keyptr, keylen, hashv) HASH_JEN(keyptr, keylen, hashv) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a, b, n) memcmp(a, b, n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) \ + do { \ + } while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) \ + do { \ + (oomed) = 1; \ + } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result in lost memory, hash tables are unusable */ + +#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 \ + 5U /* lg2 of initial number of buckets \ + */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhp */ +#define ELMT_FROM_HH(tbl, hhp) ((void *)(((char *)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl, elp) \ + ((UT_hash_handle *)(void *)(((char *)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ + do { \ + struct UT_hash_handle *_hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ + } while (0) + 
+#define HASH_VALUE(keyptr, keylen, hashv) \ + do { \ + HASH_FUNCTION(keyptr, keylen, hashv); \ + } while (0) + +#define HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, hashval, out) \ + do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval) != 0) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[_hf_bkt], \ + keyptr, keylen, hashval, out); \ + } \ + } \ + } while (0) + +#define HASH_FIND(hh, head, keyptr, keylen, out) \ + do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ + } while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN \ + (HASH_BLOOM_BITLEN / 8UL) + (((HASH_BLOOM_BITLEN % 8UL) != 0UL) ? 1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl, oomed) \ + do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t *)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ + } while (0) + +#define HASH_BLOOM_FREE(tbl) \ + do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + } while (0) + +#define HASH_BLOOM_BITSET(bv, idx) (bv[(idx) / 8U] |= (1U << ((idx) % 8U))) +#define HASH_BLOOM_BITTEST(bv, idx) (bv[(idx) / 8U] & (1U << ((idx) % 8U))) + +#define HASH_BLOOM_ADD(tbl, hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, \ + ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl, hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, \ + ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl, oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl, hashv) +#define HASH_BLOOM_TEST(tbl, hashv) (1) +#define 
HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh, head, oomed) \ + do { \ + (head)->hh.tbl = (UT_hash_table *)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char *)(&(head)->hh) - (char *)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket *)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } else { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * \ + sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + }) \ + } \ + } \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, \ + hashval, add, replaced, cmpfcn) \ + do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, \ + replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), \ + keylen_in, hashval, add, cmpfcn); \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add, \ + replaced) \ + do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, \ + replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, 
\ + hashval, add); \ + } while (0) + +#define HASH_REPLACE(hh, head, fieldname, keylen_in, add, replaced) \ + do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, \ + replaced); \ + } while (0) + +#define HASH_REPLACE_INORDER(hh, head, fieldname, keylen_in, add, replaced, \ + cmpfcn) \ + do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, \ + _hr_hashv, add, replaced, cmpfcn); \ + } while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ + do { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + (head)->hh.tbl->tail = &((add)->hh); \ + } while (0) + +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do { \ + do { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) { \ + break; \ + } \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do { \ + char *_hs_saved_head = (char *)(head); \ + do { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do { \ + if (!(oomed)) { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, \ + oomed); \ + if (oomed) { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + 
uthash_nonfatal_oom(add); \ + } else { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } else { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ + } while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } while (0) + +#endif + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, \ + hashval, add, cmpfcn) \ + do { \ + IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char *)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } else { \ + void *_hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } else { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } else { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ + } while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh, head, keyptr, keylen_in, add, cmpfcn) \ + do { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, \ + _hs_hashv, 
add, cmpfcn); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, hashval, \ + add, cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), \ + keylen_in, hashval, add, cmpfcn) + +#define HASH_ADD_INORDER(hh, head, fieldname, keylen_in, add, cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, hashval, add) \ + do { \ + IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (const void *)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } else { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE"); \ + } while (0) + +#define HASH_ADD_KEYPTR(hh, head, keyptr, keylen_in, add) \ + do { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, \ + hashval, add) + +#define HASH_ADD(hh, head, fieldname, keylen_in, add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv, num_bkts, bkt) \ + do { \ + bkt = ((hashv) & ((num_bkts)-1U)); \ + } while (0) + +/* delete "delptr" from the hash table. + * * "the usual" patch-up process for the app-order doubly-linked-list. + * * The use of _hd_hh_del below deserves special explanation. 
+ * * These used to be expressed using (delptr) but that led to a bug + * * if someone used the same symbol for the head and deletee, like + * * HASH_DELETE(hh,users,users); + * * We want that to work, but by changing the head (users) below + * * we were forfeiting our ability to further refer to the deletee + * (users) + * * in the patch-up process. Solution: use scratch space to + * * copy the deletee pointer, then the latter references are via that + * * scratch pointer rather than through the repointed (users) symbol. + * */ +#define HASH_DELETE(hh, head, delptr) HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh, head, delptrhh) \ + do { \ + struct UT_hash_handle *_hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, (head)->hh.tbl->num_buckets * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } else { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = \ + _hd_hh_del->next; \ + } else { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = \ + _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ + } while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head, findstr, out) \ + do { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ + } while (0) +#define HASH_ADD_STR(head, 
strfield, add) \ + do { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ + } while (0) +#define HASH_REPLACE_STR(head, strfield, add, replaced) \ + do { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ + } while (0) +#define HASH_FIND_INT(head, findint, out) \ + HASH_FIND(hh, head, findint, sizeof(int), out) +#define HASH_ADD_INT(head, intfield, add) \ + HASH_ADD(hh, head, intfield, sizeof(int), add) +#define HASH_REPLACE_INT(head, intfield, add, replaced) \ + HASH_REPLACE(hh, head, intfield, sizeof(int), add, replaced) +#define HASH_FIND_PTR(head, findptr, out) \ + HASH_FIND(hh, head, findptr, sizeof(void *), out) +#define HASH_ADD_PTR(head, ptrfield, add) \ + HASH_ADD(hh, head, ptrfield, sizeof(void *), add) +#define HASH_REPLACE_PTR(head, ptrfield, add, replaced) \ + HASH_REPLACE(hh, head, ptrfield, sizeof(void *), add, replaced) +#define HASH_DEL(head, delptr) HASH_DELETE(hh, head, delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is + * defined. + * * This is for uthash developer only; it compiles away if HASH_DEBUG isn't + * defined. + * */ +#ifdef HASH_DEBUG +#include /* fprintf, stderr */ +#define HASH_OOPS(...) 
\ + do { \ + fprintf(stderr, __VA_ARGS__); \ + exit(-1); \ + } while (0) +#define HASH_FSCK(hh, head, where) \ + do { \ + struct UT_hash_handle *_thh; \ + if (head) { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char *_prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char *)(_thh->hh_prev)) { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", (where), \ + (void *)_thh->hh_prev, (void *)_prev); \ + } \ + _bkt_count++; \ + _prev = (char *)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", (where), \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev != (char *)_thh->prev) { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", (where), \ + (void *)_thh->prev, (void *)_prev); \ + } \ + _prev = (char *)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = (_thh->next ? HH_FROM_ELMT((head)->hh.tbl, _thh->next) : NULL); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid app item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + } \ + } while (0) +#else +#define HASH_FSCK(hh, head, where) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * * the descriptor to which this macro is defined for tuning the hash + * function. + * * The app can #include to get the prototype for write(2). 
*/ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) \ + do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, (unsigned long)fieldlen); \ + } while (0) +#else +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Note (x<<5+x)=x*33. + */ +#define HASH_BER(key, keylen, hashv) \ + do { \ + unsigned _hb_keylen = (unsigned)keylen; \ + const unsigned char *_hb_key = (const unsigned char *)(key); \ + (hashv) = 0; \ + while (_hb_keylen-- != 0U) { \ + (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; \ + } \ + } while (0) + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx + * * (archive link: https://archive.is/Ivcan ) + * */ +#define HASH_SAX(key, keylen, hashv) \ + do { \ + unsigned _sx_i; \ + const unsigned char *_hs_key = (const unsigned char *)(key); \ + hashv = 0; \ + for (_sx_i = 0; _sx_i < keylen; _sx_i++) { \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + } \ + } while (0) +/* FNV-1a variation */ +#define HASH_FNV(key, keylen, hashv) \ + do { \ + unsigned _fn_i; \ + const unsigned char *_hf_key = (const unsigned char *)(key); \ + (hashv) = 2166136261U; \ + for (_fn_i = 0; _fn_i < keylen; _fn_i++) { \ + hashv = hashv ^ _hf_key[_fn_i]; \ + hashv = hashv * 16777619U; \ + } \ + } while (0) + +#define HASH_OAT(key, keylen, hashv) \ + do { \ + unsigned _ho_i; \ + const unsigned char *_ho_key = (const unsigned char *)(key); \ + hashv = 0; \ + for (_ho_i = 0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + } while (0) + +#define HASH_JEN_MIX(a, b, c) \ + do { \ + a -= b; \ + a -= c; \ + a ^= (c >> 13); \ + b -= c; \ + b -= a; \ + b ^= (a << 8); \ + c 
-= a; \ + c -= b; \ + c ^= (b >> 13); \ + a -= b; \ + a -= c; \ + a ^= (c >> 12); \ + b -= c; \ + b -= a; \ + b ^= (a << 16); \ + c -= a; \ + c -= b; \ + c ^= (b >> 5); \ + a -= b; \ + a -= c; \ + a ^= (c >> 3); \ + b -= c; \ + b -= a; \ + b ^= (a << 10); \ + c -= a; \ + c -= b; \ + c ^= (b >> 15); \ + } while (0) + +#define HASH_JEN(key, keylen, hashv) \ + do { \ + unsigned _hj_i, _hj_j, _hj_k; \ + unsigned const char *_hj_key = (unsigned const char *)(key); \ + hashv = 0xfeedbeefu; \ + _hj_i = _hj_j = 0x9e3779b9u; \ + _hj_k = (unsigned)(keylen); \ + while (_hj_k >= 12U) { \ + _hj_i += (_hj_key[0] + ((unsigned)_hj_key[1] << 8) + \ + ((unsigned)_hj_key[2] << 16) + ((unsigned)_hj_key[3] << 24)); \ + _hj_j += (_hj_key[4] + ((unsigned)_hj_key[5] << 8) + \ + ((unsigned)_hj_key[6] << 16) + ((unsigned)_hj_key[7] << 24)); \ + hashv += \ + (_hj_key[8] + ((unsigned)_hj_key[9] << 8) + \ + ((unsigned)_hj_key[10] << 16) + ((unsigned)_hj_key[11] << 24)); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12U; \ + } \ + hashv += (unsigned)(keylen); \ + switch (_hj_k) { \ + case 11: \ + hashv += ((unsigned)_hj_key[10] << 24); /* FALLTHROUGH */ \ + case 10: \ + hashv += ((unsigned)_hj_key[9] << 16); /* FALLTHROUGH */ \ + case 9: \ + hashv += ((unsigned)_hj_key[8] << 8); /* FALLTHROUGH */ \ + case 8: \ + _hj_j += ((unsigned)_hj_key[7] << 24); /* FALLTHROUGH */ \ + case 7: \ + _hj_j += ((unsigned)_hj_key[6] << 16); /* FALLTHROUGH */ \ + case 6: \ + _hj_j += ((unsigned)_hj_key[5] << 8); /* FALLTHROUGH */ \ + case 5: \ + _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: \ + _hj_i += ((unsigned)_hj_key[3] << 24); /* FALLTHROUGH */ \ + case 3: \ + _hj_i += ((unsigned)_hj_key[2] << 16); /* FALLTHROUGH */ \ + case 2: \ + _hj_i += ((unsigned)_hj_key[1] << 8); /* FALLTHROUGH */ \ + case 1: \ + _hj_i += _hj_key[0]; /* FALLTHROUGH */ \ + default:; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + } while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits 
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) || \ + defined(_MSC_VER) || defined(__BORLANDC__) || defined(__TURBOC__) +#define get16bits(d) (*((const uint16_t *)(d))) +#endif + +#if !defined(get16bits) +#define get16bits(d) \ + ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) + \ + (uint32_t)(((const uint8_t *)(d))[0])) +#endif +#define HASH_SFH(key, keylen, hashv) \ + do { \ + unsigned const char *_sfh_key = (unsigned const char *)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (; _sfh_len > 0U; _sfh_len--) { \ + hashv += get16bits(_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits(_sfh_key + 2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U * sizeof(uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof(uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: \ + hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + break; \ + default:; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + } while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl, hh, head, keyptr, keylen_in, hashval, out) \ + do { \ + if ((head).hh_head != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } else { \ + (out) = NULL; \ + } \ + while ((out) != NULL) { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) { 
\ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } else { \ + (out) = NULL; \ + } \ + } \ + } while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head, hh, addhh, oomed) \ + do { \ + UT_hash_bucket *_ha_head = &(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= \ + ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) && \ + !(addhh)->tbl->noexpand) { \ + HASH_EXPAND_BUCKETS(addhh, (addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { HASH_DEL_IN_BKT(head, addhh); }) \ + } \ + } while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head, delhh) \ + do { \ + UT_hash_bucket *_hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ + } while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * * and redistributing the items into the new buckets. Ideally the + * * items will distribute more or less evenly into the new buckets + * * (the extent to which this is true is a measure of the quality of + * * the hash function as it applies to the key domain). + * * + * * With the items distributed into more buckets, the chain length + * * (item count) in each bucket is reduced. Thus by expanding buckets + * * the hash keeps a bound on the chain length. This bounded chain + * * length is the essence of how a hash provides constant time lookup. + * * + * * The calculation of tbl->ideal_chain_maxlen below deserves some + * * explanation. 
First, keep in mind that we're calculating the + * ideal + * * maximum chain length based on the *new* (doubled) bucket + * count. + * * In fractions this is just n/b (n=number of items,b=new num + * buckets). + * * Since the ideal chain length is an integer, we want to + * calculate + * * ceil(n/b). We don't depend on floating point arithmetic in + * this + * * hash, so to calculate ceil(n/b) with integers we could + * write + * * + * * ceil(n/b) = (n/b) + ((n%b)?1:0) + * * + * * and in fact a previous version of this hash did just + * that. + * * But now we have improved things a bit by recognizing + * that b is + * * always a power of two. We keep its base 2 log handy + * (call it lb), + * * so now we can write this with a bit shift and + * logical AND: + * * + * * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * * + * */ +#define HASH_EXPAND_BUCKETS(hh, tbl, oomed) \ + do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket *)uthash_malloc( \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + if (!_he_new_buckets) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero(_he_new_buckets, \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets + 1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets * 2U) - 1U)) != 0U) \ + ? 
1U \ + : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) { \ + _he_thh = (tbl)->buckets[_he_bkt_i].hh_head; \ + while (_he_thh != NULL) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > \ + _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, \ + (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) \ + ? ((tbl)->ineff_expands + 1U) \ + : 0U; \ + if ((tbl)->ineff_expands > 1U) { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ + } while (0) + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * * HASH_SRT was added to allow the hash handle name to be passed in. 
*/ +#define HASH_SORT(head, cmpfcn) HASH_SRT(hh, head, cmpfcn) +#define HASH_SRT(hh, head, cmpfcn) \ + do { \ + unsigned _hs_i; \ + unsigned _hs_looping, _hs_nmerges, _hs_insize, _hs_psize, _hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + if (_hs_q == NULL) { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || \ + ((_hs_qsize != 0U) && (_hs_q != NULL))) { \ + if (_hs_psize == 0U) { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } else if ((_hs_qsize == 0U) || (_hs_q == NULL)) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } else if ((cmpfcn(DECLTYPE(head)( \ + ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, \ + _hs_q)))) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = \ + ((_hs_e != NULL) ? ELMT_FROM_HH((head)->hh.tbl, _hs_e) \ + : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) { \ + _hs_e->prev = \ + ((_hs_tail != NULL) ? 
ELMT_FROM_HH((head)->hh.tbl, _hs_tail) \ + : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ + } while (0) + +/* This function selects items from one hash into another hash. + * * The end result is that the selected items have dual presence + * * in both hashes. There is no copy of the items made; rather + * * they are added into the new hash through a secondary hash + * * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ + do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh = NULL; \ + ptrdiff_t _dst_hho = ((char *)(&(dst)->hh_dst) - (char *)(dst)); \ + if ((src) != NULL) { \ + for (_src_bkt = 0; _src_bkt < (src)->hh_src.tbl->num_buckets; \ + _src_bkt++) { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh != NULL; _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + IF_HASH_NONFATAL_OOM(int _hs_oomed = 0;) \ + _dst_hh = (UT_hash_handle *)(void *)(((char *)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ + uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + }) \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, \ + _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + }) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ + } while (0) + +#define HASH_CLEAR(hh, head) \ + do { \ + if ((head) != NULL) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, (head)->hh.tbl->num_buckets * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ + } while (0) + +#define HASH_OVERHEAD(hh, head) \ + (((head) != NULL) \ + ? ((size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + (HASH_BLOOM_BYTELEN))) \ + : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), \ + ((*(char **)(&(tmp))) = \ + (char *)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), ((*(char **)(&(tmp))) = \ + (char *)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#else +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), \ + ((tmp) = DECLTYPE(el)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), \ + ((tmp) = DECLTYPE(el)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh, head) +#define HASH_CNT(hh, head) ((head != NULL) ? ((head)->hh.tbl->num_items) : 0U) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. 
In this situation, the max chain length + * * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. + * (If + * * the bucket's chain exceeds this length, bucket expansion is + * triggered). + * * However, setting expand_mult to a non-zero value delays + * bucket expansion + * * (that would be triggered by additions to this particular + * bucket) + * * until its chain length reaches a *multiple* of + * HASH_BKT_CAPACITY_THRESH. + * * (The multiplier is simply expand_mult+1). The + * whole idea of this + * * multiplier is to reduce bucket expansions, + * since they are expensive, in + * * situations where we know that a + * particular bucket tends to be overused. + * * It is better to let its chain length + * grow to a longer yet-still-bounded + * * value, than to do an O(n) bucket + * expansion too often. + * */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * * more than ceil(#items/#buckets) items. that's the ideal chain length. + */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * * exceeds the ideal chain maxlen. these items pay the penalty for an + * uneven + * * hash distribution; reaching them in a chain traversal takes + * >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * * afterward, more than half the items in the hash had nonideal chain + * * positions. 
If this happens on two consecutive expansions we + * inhibit any + * * further expansion, as it's not helping; this happens when the + * hash + * * function isn't a good fit for the key domain. When + * expansion is inhibited + * * the hash will still work, albeit no longer in + * constant time. */ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + uint8_t bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + const void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..a6343a9 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(shm_malloc) +add_subdirectory(posix) +add_subdirectory(controller) +add_subdirectory(client) +add_subdirectory(shared) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} PARENT_SCOPE) +set(CLIENT_SOURCE_FILES ${CLIENT_SOURCE_FILES} PARENT_SCOPE) +set(CONTROLLER_SOURCE_FILES ${CONTROLLER_SOURCE_FILES} PARENT_SCOPE) + +add_compile_options(-Wall -Wextra -pedantic -Werror) diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt new file mode 100644 index 0000000..6b10a84 --- /dev/null +++ b/src/client/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(CLIENT_SOURCE_FILES ${CLIENT_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/client_cleanup.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_init.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_memory.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_place_requests.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_read_location.c + PARENT_SCOPE) diff --git a/src/client/client_cleanup.c b/src/client/client_cleanup.c new file mode 100644 index 0000000..5bd8f85 --- /dev/null +++ b/src/client/client_cleanup.c @@ -0,0 +1,29 @@ +#include "client_cleanup.h" +#include "posix_sm.h" +#include "request.h" + +#include "shm_malloc.h" + +#include + +void cleanup_queues(struct client *client) { + request_queue_deactivate(client->shared_requests); + request_queue_deactivate(client->shared_completions); + + client->shared_requests = NULL; + client->shared_completions = NULL; +} + +void cleanup_shared_mem(struct client *client) { + scoria_sm_unmap(client->shared_location, sizeof(struct memory_location), + "client:unmap"); + + close(client->fd_location); + close(client->fd_requests); + close(client->fd_completions); +} + +void cleanup(struct client *client) { + cleanup_queues(client); + cleanup_shared_mem(client); +} diff --git a/src/client/client_init.c b/src/client/client_init.c new file mode 100644 index 0000000..a18a21e --- /dev/null +++ b/src/client/client_init.c @@ -0,0 +1,135 @@ +#include "client_init.h" +#include "client_read_location.h" + +#include "client.h" +#include "config.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void init_memory_pool(struct client *client) { + if (shm_init(SHARED_MEMORY_NAME, setup) < 0) + scoria_error("Client:shm_init"); + + client->shared_mem_ptr = shm_global(); + + if (client->chatty) { + printf("Client: Mapped Shared Memory Address: %p %p\n", + (void *)client->shared_mem_ptr, + (void 
*)client->shared_location->shared_mem_ptr); + + if (client->shared_mem_ptr == client->shared_location->shared_mem_ptr) + printf("Client: Successfully Mapped Shared Memory Address\n"); + else + // TODO: Error handling + scoria_error("Client: Mapped Shared Memory to Incorrect Address\n"); + } +} + +void init_requests(struct client *client) { + client->fd_requests = + scoria_sm_open(SHARED_REQUESTS_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_requests_list = + scoria_sm_map(client->shared_location->shared_requests_list, + sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, client->fd_requests, 0, "client:mmap"); + + if (client->chatty) { + if (client->shared_requests_list == + client->shared_location->shared_requests_list) + printf("Client: Successfully Mapped Shared Request Queue List to " + "Address: %p %p\n", + (void *)client->shared_requests_list, + (void *)client->shared_location->shared_requests_list); + else + // TODO: Error handling + scoria_error( + "Client: Mapped Shared Request Queue List to Incorrect Address\n"); + } +} + +void init_completions(struct client *client) { + client->fd_completions = + scoria_sm_open(SHARED_COMPLETIONS_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_completions_list = + scoria_sm_map(client->shared_location->shared_completions_list, + sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, client->fd_completions, 0, "client:mmap"); + + if (client->chatty) { + if (client->shared_completions_list == + client->shared_location->shared_completions_list) + printf("Client: Successfully Mapped Shared Completion Queue List to " + "Address: %p " + "%p\n", + (void *)client->shared_completions_list, + (void *)client->shared_location->shared_completions_list); + else + // TODO: Error handling + scoria_error("Client: Mapped Shared Completiong Queue List to Incorrect " + "Address\n"); + } +} + +void init_virtual_address_mailbox(struct client *client) { + client->fd_location = + 
scoria_sm_open(SHARED_LOCATION_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_location = scoria_sm_map( + NULL, sizeof(struct memory_location), PROT_READ | PROT_WRITE, MAP_SHARED, + client->fd_location, 0, "client:mmap"); +} + +void init_id(struct client *client) { + int id = -1; + + for (int i = 0; i < MAX_CLIENTS; i++) { + if (client->shared_requests_list->queues[i].active == 0) { + assert(client->shared_requests_list->queues[i].client == -1); + + assert(client->shared_completions_list->queues[i].active == 0); + assert(client->shared_completions_list->queues[i].client == -1); + + // TODO: Thread Safe + request_queue_activate(&(client->shared_requests_list->queues[i]), i); + request_queue_activate(&(client->shared_completions_list->queues[i]), i); + + client->shared_requests = &(client->shared_requests_list->queues[i]); + client->shared_completions = + &(client->shared_completions_list->queues[i]); + + id = i; + break; + } + } + + if (id == -1) + scoria_error("Client: Exceeded Maxmimum Number of Clients\n"); + + client->id = id; + + if (client->chatty) + printf("Client: Assigned ID %d\n", client->id); +} + +void init(struct client *client) { + init_virtual_address_mailbox(client); + + read_location(client); + + init_memory_pool(client); + init_requests(client); + init_completions(client); + + init_id(client); + + client->unmatched_requests = NULL; + + printf("Client(%d): Connected to Controller Successfully\n", client->id); +} diff --git a/src/client/client_memory.c b/src/client/client_memory.c new file mode 100644 index 0000000..b55a346 --- /dev/null +++ b/src/client/client_memory.c @@ -0,0 +1,84 @@ +#include "client_memory.h" + +#include "client.h" +#include "config.h" +#include "request.h" + +#include +#include + +static int rid = 0; + +void scoria_put_request(struct client *client, struct request *req) { + request_queue_put(client->shared_requests, req); + + if (client->chatty) + printf("Client(%d): Added Request %d:%d to Request Queue %d\n", 
client->id, + req->client, req->id, client->id); +} + +void scoria_quit(struct client *client, struct request *req) { + if (client->chatty) + printf("Client(%d): Quit Request\n", client->id); + + req->client = client->id; + req->r_type = Quit; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Created Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} + +void scoria_read(struct client *client, void *buffer, const size_t N, + void *output, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req) { + if (client->chatty) + printf("Client(%d): Reading Buffer\n", client->id); + + req->client = client->id; + req->r_type = Read; + req->input = buffer; + req->output = output; + req->N = N; + req->ind1 = ind1; + req->ind2 = ind2; + req->nthreads = num_threads; + req->use_avx = use_avx; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Create Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} + +void scoria_write(struct client *client, void *buffer, const size_t N, + void *input, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req) { + if (client->chatty) + printf("Client(%d): Writing Buffer\n", client->id); + + req->client = client->id; + req->r_type = Write; + req->output = buffer; + req->input = input; + req->N = N; + req->ind1 = ind1; + req->ind2 = ind2; + req->nthreads = num_threads; + req->use_avx = use_avx; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Created Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} diff --git a/src/client/client_place_requests.c b/src/client/client_place_requests.c new file mode 100644 index 0000000..8de3ddd --- /dev/null +++ 
b/src/client/client_place_requests.c @@ -0,0 +1,107 @@ +#include "client_place_requests.h" +#include "client_memory.h" + +#include "client.h" +#include "config.h" +#include "request.h" + +#include "shm_malloc.h" +#include "uthash.h" + +#include + +void wait_request(struct client *client, struct request *req) { + if (client->chatty) + printf("Client(%d): Waiting on Request %d:%d\n", client->id, req->client, + req->id); + + int found; + int id = req->id; + + struct request *query; + + HASH_FIND_INT(client->unmatched_requests, &id, query); + if (query == NULL) + found = 0; + else + found = 1; + + while (!found) { + struct request complete; + request_queue_fetch(client->shared_completions, &complete); + + if (complete.r_type == Kill) { + printf("Received a Kill Request Originating from a Quit Request from " + "Client(%d)\n", + complete.client); + exit(1); + } + + if (complete.id == id) { + *req = complete; + struct request *find; + + HASH_FIND_INT(client->unmatched_requests, &id, find); + if (find != NULL) + HASH_DEL(client->unmatched_requests, find); + + found = 1; + } else + HASH_ADD_INT(client->unmatched_requests, id, &complete); + } + + if (client->chatty) + printf("Client(%d): Controller Completed Request %d:%d\n", client->id, + req->client, id); +} + +void wait_requests(struct client *client, struct request *reqs, + size_t num_reqs) { + for (size_t i = 0; i < num_reqs; ++i) + wait_request(client, &reqs[i]); +} + +void place_requests(struct client *client) { + // Allocate Buffer + double *A = shm_malloc(1024 * sizeof(int)); + + if (client->chatty) + printf("Client(%d): Received Pointer to Allocated Memory: %p\n", client->id, + (void *)A); + + // Write to Buffer + printf("Client(%d): Writing Array:\n", client->id); + + double *input = shm_malloc(1024 * sizeof(double)); + for (size_t i = 0; i < 1024; ++i) { + input[i] = (double)(2 * i); + } + + struct request req1; + scoria_write(client, A, 1024, input, NULL, NULL, 0, 0, &req1); + wait_request(client, &req1); + 
shm_free(input); + + // Read from Buffer + printf("Client(%d): Reading Array:\n", client->id); + + double *output = shm_malloc(1024 * sizeof(double)); + + struct request req2; + scoria_read(client, A, 1024, output, NULL, NULL, 0, 0, &req2); + wait_request(client, &req2); + + for (size_t i = 0; i < 1024; ++i) + printf("%.2f ", output[i]); + printf("\n"); + shm_free(output); + + // Free Buffer + shm_free(A); + + // Exit Program + // struct request req3; + // scoria_quit(client, &req3); + + // wait_request(client, &req3); +} diff --git a/src/client/client_read_location.c b/src/client/client_read_location.c new file mode 100644 index 0000000..444eb69 --- /dev/null +++ b/src/client/client_read_location.c @@ -0,0 +1,33 @@ +#include "client_read_location.h" + +#include "client.h" +#include "config.h" + +#include + +void read_location(struct client *client) { + if (client->chatty) + printf("Client: Waiting on Controller\n"); + + while (!client->shared_location->ready) { + ; + ; + } + + client->shared_mem_ptr = client->shared_location->shared_mem_ptr; + client->shared_requests_list = client->shared_location->shared_requests_list; + client->shared_completions_list = + client->shared_location->shared_completions_list; + + if (client->chatty) { + printf("Client: Received Shared Memory Addresses\n"); + printf("Client: shared_mem_ptr %p %p\n", (void *)client->shared_mem_ptr, + (void *)client->shared_location->shared_mem_ptr); + printf("Client: shared_requests_list %p %p\n", + (void *)client->shared_requests_list, + (void *)client->shared_location->shared_requests_list); + printf("Client: shared_completions_list %p %p\n", + (void *)client->shared_completions_list, + (void *)client->shared_location->shared_completions_list); + } +} diff --git a/src/controller/CMakeLists.txt b/src/controller/CMakeLists.txt new file mode 100644 index 0000000..e1c3a99 --- /dev/null +++ b/src/controller/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator 
C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(CONTROLLER_SOURCE_FILES ${CONTROLLER_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/controller_cleanup.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_handle_requests.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_init.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_write_location.c + PARENT_SCOPE) diff --git a/src/controller/controller_cleanup.c b/src/controller/controller_cleanup.c new file mode 100644 index 0000000..5dd0cfd --- /dev/null +++ b/src/controller/controller_cleanup.c @@ -0,0 +1,40 @@ +#include "controller_cleanup.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include + +void cleanup_shared_mem(struct controller *controller) { + request_queue_list_free(controller->shared_requests_list); + request_queue_list_free(controller->shared_completions_list); + + scoria_sm_unmap(controller->shared_location, sizeof(struct memory_location), + "controller:unmap:shared_location"); + scoria_sm_unlink(SHARED_LOCATION_NAME, + "controller:sem_unlink:shared_location"); + + scoria_sm_unmap(controller->shared_requests_list, + sizeof(struct request_queue_list), + "controller:unmap:shared_requests"); + scoria_sm_unlink(SHARED_REQUESTS_NAME, + "controller:sem_unlink:shared_requests"); + + scoria_sm_unmap(controller->shared_completions_list, + sizeof(struct request_queue_list), + "controller:unmap:shared_completions"); + scoria_sm_unlink(SHARED_COMPLETIONS_NAME, + "controller:sem_unlink:shared_completions"); + + close(controller->fd_location); + close(controller->fd_requests); + close(controller->fd_completions); +} + +void cleanup(struct controller *controller) { + cleanup_shared_mem(controller); + + shm_destroy(); +} diff --git a/src/controller/controller_handle_requests.c b/src/controller/controller_handle_requests.c new file mode 100644 index 0000000..fba3693 --- /dev/null +++ b/src/controller/controller_handle_requests.c @@ -0,0 
+1,232 @@ +#include "controller_handle_requests.h" + +#include "config.h" +#include "controller.h" +#include "kernels.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include + +int quit = 0; +int tid = -1; + +void handle_read(struct controller *controller, struct request_queue *queue, + struct request *req) { + if (controller->chatty) + printf("Controller: Received Request Object: Client: %d ID: %d Type: %d N: " + "%ld\n", + req->client, req->id, req->r_type, req->N); + + if (req->ind1 == NULL) { + assert(req->ind2 == NULL); + if (req->nthreads == 0) { + read_single_thread_0(req->output, req->input, req->N, req->use_avx); + } else { + read_multi_thread_0(req->output, req->input, req->N, req->nthreads, + req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); + + return; + } + + if (req->ind2 == NULL) { + assert(req->ind1 != NULL); + if (req->nthreads == 0) { + read_single_thread_1(req->output, req->input, req->N, req->ind1, + req->use_avx); + } else { + read_multi_thread_1(req->output, req->input, req->N, req->ind1, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); + + return; + } + + assert(req->ind1 != NULL); + assert(req->ind2 != NULL); + + if (req->nthreads == 0) { + read_single_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->use_avx); + } else { + read_multi_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); +} + +void handle_write(struct controller *controller, struct request_queue *queue, + struct 
request *req) { + if (controller->chatty) + printf("Controller: Received Request Object: Client: %d ID: %d Type: %d " + "Pointer: %p Input Pointer: %p N: %ld\n", + req->client, req->id, req->r_type, (void *)req->output, + (void *)req->input, req->N); + + if (req->ind1 == NULL) { + assert(req->ind2 == NULL); + if (req->nthreads == 0) { + write_single_thread_0(req->output, req->input, req->N, req->use_avx); + } else { + write_multi_thread_0(req->output, req->input, req->N, req->nthreads, + req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); + + return; + } + + if (req->ind2 == NULL) { + assert(req->ind1 != NULL); + if (req->nthreads == 0) { + write_single_thread_1(req->output, req->input, req->N, req->ind1, + req->use_avx); + } else { + write_multi_thread_1(req->output, req->input, req->N, req->ind1, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); + + return; + } + + assert(req->ind1 != NULL); + assert(req->ind2 != NULL); + + if (req->nthreads == 0) { + write_single_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->use_avx); + } else { + write_multi_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); +} + +void *handler(void *args) { + struct thread_args *a = args; + + size_t i = a->i; + struct controller *controller = a->controller; + + struct request_queue *requests = + &(controller->shared_requests_list->queues[i]); + struct request_queue *completions = + &(controller->shared_completions_list->queues[i]); + + while (!quit) { + 
struct request req; + request_queue_fetch(requests, &req); + + if (controller->chatty) + printf("Controller: Client (%ld): Request %d Detected\n", i, req.id); + + switch (req.r_type) { + case Read: + handle_read(controller, completions, &req); + break; + case Write: + handle_write(controller, completions, &req); + break; + case Quit: + tid = i; + quit = 1; + req.r_status = Ready; + request_queue_put(completions, &req); + break; + case Kill: + quit = 1; + break; + default: + printf("Controller: Client (%ld): Invalid Request Type Detected\n", i); + tid = i; + quit = 1; + } + } + + return NULL; +} + +void handle_requests(struct controller *controller) { + // Start loop + pthread_t threads[MAX_CLIENTS]; + struct thread_args args[MAX_CLIENTS]; + + for (size_t i = 0; i < MAX_CLIENTS; ++i) { + args[i].i = i; + args[i].controller = controller; + + int ret = pthread_create(&threads[i], NULL, handler, &args[i]); + assert(ret == 0); + } + + while (!quit) { + ; + ; + } + + for (int i = 0; i < MAX_CLIENTS; ++i) { + if (i != tid) { + struct request req; + req.r_type = Kill; + req.id = -1; + req.client = tid; + request_queue_put(&(controller->shared_completions_list->queues[i]), + &req); + request_queue_put(&(controller->shared_requests_list->queues[i]), &req); + } + } + + for (int i = 0; i < MAX_CLIENTS; ++i) { + pthread_join(threads[i], NULL); + } + + printf("Controller: Quit Request Received from Client(%d)\n", tid); +} diff --git a/src/controller/controller_init.c b/src/controller/controller_init.c new file mode 100644 index 0000000..eecea79 --- /dev/null +++ b/src/controller/controller_init.c @@ -0,0 +1,94 @@ +#include "controller_init.h" +#include "controller_write_location.h" + +#include "config.h" +#include "controller.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void init_files() { + if (access(SHARED_MEMORY_NAME, F_OK) == 0) + if (unlink(SHARED_MEMORY_NAME) == -1) + 
scoria_error("controller:unlink:shared_memory"); +} + +void init_memory_pool(struct controller *controller) { + if (shm_init(SHARED_MEMORY_NAME, setup) < 0) + scoria_error("Controller:shm_init"); + + controller->shared_mem_ptr = shm_global(); +} + +void init_requests(struct controller *controller) { + controller->fd_requests = + scoria_sm_open(SHARED_REQUESTS_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_requests, sizeof(struct request_queue_list), + "controller:ftruncate"); + controller->shared_requests_list = scoria_sm_map( + NULL, sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, controller->fd_requests, 0, "controller:mmap"); + + request_queue_list_init(controller->shared_requests_list); + + if (controller->chatty) + printf("Controller: Shared Request Address: %p\n", + (void *)controller->shared_requests_list); +} + +void init_completions(struct controller *controller) { + controller->fd_completions = + scoria_sm_open(SHARED_COMPLETIONS_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_completions, + sizeof(struct request_queue_list), "controller:ftruncate"); + controller->shared_completions_list = scoria_sm_map( + NULL, sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, controller->fd_completions, 0, "controller:mmap"); + + request_queue_list_init(controller->shared_completions_list); + + if (controller->chatty) + printf("Controller: Shared Completions Address: %p\n", + (void *)controller->shared_completions_list); +} + +void init_virtual_address_mailbox(struct controller *controller) { + controller->fd_location = + scoria_sm_open(SHARED_LOCATION_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_location, sizeof(struct memory_location), + "controller:ftruncate"); + controller->shared_location = scoria_sm_map( + NULL, sizeof(struct memory_location), PROT_READ 
| PROT_WRITE, MAP_SHARED, + controller->fd_location, 0, "controller:mmap"); + + controller->shared_location->ready = 0; + controller->shared_location->shared_mem_ptr = NULL; + controller->shared_location->shared_requests_list = NULL; + controller->shared_location->shared_completions_list = NULL; + + if (controller->chatty) + printf("Controller: Shared Location: %p\n", + (void *)controller->shared_location); +} + +void init(struct controller *controller) { + init_files(); + + init_virtual_address_mailbox(controller); + + init_memory_pool(controller); + init_requests(controller); + init_completions(controller); + + write_location(controller); +} diff --git a/src/controller/controller_write_location.c b/src/controller/controller_write_location.c new file mode 100644 index 0000000..27f8771 --- /dev/null +++ b/src/controller/controller_write_location.c @@ -0,0 +1,29 @@ +#include "controller_write_location.h" + +#include "config.h" +#include "controller.h" + +#include + +void write_location(struct controller *controller) { + controller->shared_location->shared_mem_ptr = controller->shared_mem_ptr; + controller->shared_location->shared_requests_list = + controller->shared_requests_list; + controller->shared_location->shared_completions_list = + controller->shared_completions_list; + + controller->shared_location->ready = 1; + + if (controller->chatty) { + printf("Controller: Posted Shared Memory Addresses\n"); + printf("Controller: shared_mem_ptr %p %p\n", + (void *)controller->shared_location->shared_mem_ptr, + (void *)controller->shared_mem_ptr); + printf("Controller: shared_requests_list %p %p\n", + (void *)controller->shared_location->shared_requests_list, + (void *)controller->shared_requests_list); + printf("Controller: shared_completions_list %p %p\n", + (void *)controller->shared_location->shared_completions_list, + (void *)controller->shared_completions_list); + } +} diff --git a/src/posix/CMakeLists.txt b/src/posix/CMakeLists.txt new file mode 100644 index 
0000000..745ff76 --- /dev/null +++ b/src/posix/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/posix_sm.c + PARENT_SCOPE) diff --git a/src/posix/posix_sm.c b/src/posix/posix_sm.c new file mode 100644 index 0000000..efb1277 --- /dev/null +++ b/src/posix/posix_sm.c @@ -0,0 +1,43 @@ +#include "posix_sm.h" +#include "config.h" +#include "utils.h" + +#include +#include +#include +#include + +int scoria_sm_open(const char *name, int oflag, mode_t mode, const char *msg) { + int fd; + + if ((fd = shm_open(name, oflag, mode)) == -1) + scoria_error(msg); + + return fd; +} + +void scoria_sm_unlink(const char *name, const char *msg) { + if (shm_unlink(name) == -1) + scoria_error(msg); +} + +void scoria_sm_truncate(const int fd, const size_t length, const char *msg) { + if (ftruncate(fd, length) == -1) + scoria_error(msg); +} + +void *scoria_sm_map(void *addr, const size_t length, const int prot, + const int flags, const int fd, const off_t offset, + const char *msg) { + void *ptr; + + if ((ptr = mmap(addr, length, prot, flags, fd, offset)) == MAP_FAILED) + scoria_error(msg); + + return ptr; +} + +void scoria_sm_unmap(void *ptr, const size_t length, const char *msg) { + if (munmap(ptr, length) == -1) + scoria_error(msg); +} diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt new file mode 100644 index 0000000..a988b60 --- /dev/null +++ b/src/shared/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/request.c + ${CMAKE_CURRENT_SOURCE_DIR}/utils.c + PARENT_SCOPE) diff --git a/src/shared/request.c 
b/src/shared/request.c new file mode 100644 index 0000000..e8d6f9b --- /dev/null +++ b/src/shared/request.c @@ -0,0 +1,130 @@ +#include "request.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void request_queue_init(request_queue *rq) { + rq->client = -1; + rq->active = 0; + + rq->begin = &(rq->requests[0]); + rq->end = &(rq->requests[REQUEST_QUEUE_SIZE]); + + rq->head = rq->begin; + rq->tail = rq->begin; + + rq->count = 0; + rq->size = sizeof(struct request); + + rq->capacity = REQUEST_QUEUE_SIZE; + + pthread_mutexattr_init(&(rq->attr_lock)); + pthread_mutexattr_setpshared(&(rq->attr_lock), PTHREAD_PROCESS_SHARED); + + pthread_mutex_init(&(rq->lock), &(rq->attr_lock)); + + pthread_condattr_init(&(rq->attr_empty)); + pthread_condattr_setpshared(&(rq->attr_empty), PTHREAD_PROCESS_SHARED); + + pthread_cond_init(&(rq->empty), &(rq->attr_empty)); + + pthread_condattr_init(&(rq->attr_fill)); + pthread_condattr_setpshared(&(rq->attr_fill), PTHREAD_PROCESS_SHARED); + + pthread_cond_init(&(rq->fill), &(rq->attr_fill)); +} + +void request_queue_free(request_queue *rq) { + pthread_mutex_destroy(&(rq->lock)); + pthread_mutexattr_destroy(&(rq->attr_lock)); + + pthread_cond_destroy(&(rq->empty)); + pthread_condattr_destroy(&(rq->attr_empty)); + + pthread_cond_destroy(&(rq->fill)); + pthread_condattr_destroy(&(rq->attr_fill)); +} + +void request_queue_put(request_queue *rq, const struct request *item) { + pthread_mutex_lock(&(rq->lock)); + + while (rq->count == rq->capacity) + pthread_cond_wait(&(rq->empty), &(rq->lock)); + + memcpy(rq->head, item, rq->size); + + rq->head = rq->head + 1; + + if (rq->head == rq->end) + rq->head = rq->begin; + + rq->count++; + + pthread_cond_signal(&(rq->fill)); + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_fetch(request_queue *rq, struct request *item) { + pthread_mutex_lock(&(rq->lock)); + + while (rq->count == 0) + pthread_cond_wait(&(rq->fill), &(rq->lock)); + + memcpy(item, rq->tail, rq->size); 
 + + rq->tail = rq->tail + 1; + + if (rq->tail == rq->end) + rq->tail = rq->begin; + + rq->count--; + + pthread_cond_signal(&(rq->empty)); + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_activate(request_queue *rq, int id) { + pthread_mutex_lock(&(rq->lock)); + + assert(rq->client == -1); + assert(rq->active == 0); + + rq->client = id; + rq->active = 1; + + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_deactivate(request_queue *rq) { + pthread_mutex_lock(&(rq->lock)); + + rq->client = -1; + rq->active = 0; + + rq->begin = &(rq->requests[0]); + rq->end = &(rq->requests[REQUEST_QUEUE_SIZE]); + + rq->head = rq->begin; + rq->tail = rq->begin; + + rq->count = 0; + rq->size = sizeof(struct request); + + rq->capacity = REQUEST_QUEUE_SIZE; + + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_list_init(request_queue_list *rql) { + for (size_t i = 0; i < MAX_CLIENTS; ++i) + request_queue_init(&(rql->queues[i])); +} + +void request_queue_list_free(request_queue_list *rql) { + for (size_t i = 0; i < MAX_CLIENTS; ++i) + request_queue_free(&(rql->queues[i])); +} diff --git a/src/shared/utils.c b/src/shared/utils.c new file mode 100644 index 0000000..4eefb77 --- /dev/null +++ b/src/shared/utils.c @@ -0,0 +1,27 @@ +#include "utils.h" +#include "config.h" + +#include "shm_malloc.h" + +#include +#include +#include + +void setup() { + struct shared_memory *shared_mem_ptr = + shm_malloc(sizeof(struct shared_memory)); + + if (!shared_mem_ptr) { + // TODO: Handle Error + } + + shared_mem_ptr->head = 0; + shared_mem_ptr->tail = &shared_mem_ptr->head; + + shm_set_global(shared_mem_ptr); +} + +void scoria_error(const char *msg) { + perror(msg); + exit(1); +} diff --git a/src/shm_malloc/CMakeLists.txt b/src/shm_malloc/CMakeLists.txt new file mode 100644 index 0000000..127060f --- /dev/null +++ b/src/shm_malloc/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.10) +project(shm_malloc C) + +add_library(shm STATIC malloc.c malloc.h) 
+target_compile_definitions(shm PUBLIC SHM) +target_include_directories(shm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/shm_malloc/README b/src/shm_malloc/README new file mode 100644 index 0000000..dd0b706 --- /dev/null +++ b/src/shm_malloc/README @@ -0,0 +1,130 @@ +NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY + +THIS CODE IS RELEASED AS IS, WITH NO WARRANTY OF ANY KIND. + +NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY + + +These malloc routines are based primarily on the paper "Efficient Kernel +Memory Allocation on Shared-Memory Multiprocessors", by McKenney and +Slingwine, which appears in the USENIX Winter 1993 conference proceedings. + +The basic idea is to reduce the number of interlocks required by using +small per-process free lists. Interlocks are only required when the +free lists are empty or overflow. + +This per-process free list idea is used only for blocks smaller than 1 +page. For these small blocks, all allocations are rounded up to a +power-of-two size and one free list is used for each size. Blocks are +allocated by splitting an entire page into blocks of the same size, so +that no extra per-block storage is required for in-use blocks. + +Larger allocations are always rounded up to a multiple of the page size. +Free pages are kept in a sorted list, and a best-fit allocation scheme +is used. + +When compiled directly, the source will produce malloc replacement code. + +When compiled with -DMALLOC_DEBUG, it will add in extra guard and consistency +checks as an aid to debugging heap corruption problems. + +When compiled with -DSHM, the source will produce a shared-memory malloc +package, which uses mmap to get memory from the system (so it may also +be used as persistent storage). There are a number of important routines. + +void *shm_malloc(size_t); +void *shm_calloc(int, int); +void *shm_realloc(void *, size_t); +void shm_free(void *); + These are used instead of the standard malloc routines. 
 + +void shm_init(char *filename [, void (*init)()]); + This must be called in each process to get access to the shared memory. + If the file doesn't (yet) exist, it will be created and initialized to + `all free'. It contains code to avoid race conditions, so you can call + it in multiple processes simultaneously and the right thing will happen + + The optional init function will be called if this is a newly created + file. Only one process will call this function if multiple processes + call shm_init on a new file simultaneously; the others will wait for it + to finish. + +void shm_fini(); + This must be called in each process before exiting (unless shm_destroy + is called). It frees up all the process internal tables and `disconnects' + from the shared memory. + +void shm_destroy(); + This works as shm_fini and also removes the shared memory file. Other + processes which are trying to use the shared memory may fail mysteriously + after this is called. + +void shm_child(); + This should be called in the child process if a process calls fork + after it calls shm_init, before the child process calls any other + shared memory routine. The child can safely call exit or exec + without calling shm_child, as long as it calls no shared memory + routines before then. + +void *shm_global(); +void shm_set_global(void *); + These two routines give all processes that have mapped a shared memory file + (with shm_init) access to a single global variable. Usually a pointer + into the shared memory. Initialized to 0 when shm_init is called for + a non-existent file. + +The code is arranged so that all processes will map the memory at the same +address. This precludes the possibility of mapping multiple shared memory +files into one process, but it's almost impossible to work without it. The +shared memory load address is a constant that will need to be changed for +any port. 
+ +You have a choice of 4 different locking schemes for the global tables: +System V semaphores, File locks, atomic test-and-set spin locks, and +pthreads mutexes. You can choose between them by compiling with one of: + -DLOCKTYPE=SYSVSEM + -DLOCKTYPE=FLOCK + -DLOCKTYPE=SPINLOCK + -DLOCKTYPE=PMUTEX +Put this into the `CFLAGS' in the Makefile, or use it on the command line +when compiling malloc.c manually. The default is posix mutexes on posix +systems, otherwise spinlocks if it is supported, otherwise file locks. +Currently spinlocks are only supported when using GCC on the following +processors: + m68k + m88k + sparc + alpha + i386 + ppc + x86_64 +To add support for an additional processor, you'll need to add appropriate +macros to `atomic.h'. + +I've tested this code on the following machines: + sparc SunOS 4 + sparc SunOS 5 + mips IRIX 5 + alpha OSF/1 + i586 FreeBSD 2.1.5 + rs6k AIX 4.2 + Linux 2.6.38 + +On FreeBSD and AIX, the code does not work if the shared memory file is on +an NFS-mounted partition. On all systems, its a bit slower to use an NFS +file, so you're better off using a local file if possible. + +Note that the fact that you can use a file on a NFS server does NOT mean +that you can use this code for distributed shared memory -- it won't +work. In general ANY TIME two different client machines try to modify +the same file on a NFS server, all bets are off and everything will +break. NFS is an abbreviation for `Not a File System.' + +On hppa HP-UX 9 machines, this code is known to crash the machine. On +i86 FreeBSD 2.0 it also frequently crashes the machine. + +I'd appreciate reciving a copy of any bug-fixes, enhancements, or ports +anyone makes with this code. 
+ +Chris Dodd +dodd@csl.sri.com diff --git a/src/shm_malloc/atomic.h b/src/shm_malloc/atomic.h new file mode 100644 index 0000000..67b91f6 --- /dev/null +++ b/src/shm_malloc/atomic.h @@ -0,0 +1,250 @@ +#ifndef __atomic_h__ +#define __atomic_h__ + +#ifndef __GNUC__ +#error This file requires GCC +#else + +#if defined(mc68000) +typedef unsigned int word_t; + +typedef unsigned int TAS_t; +#define TAS(m) \ + ({ \ + register TAS_t _t_tas; \ + asm volatile("tas (%1); smi %0" : "=g"(_t_tas) : "a"(&(m))); \ + _t_tas; \ + }) + +#if defined(mc68020) +#define CAS(m, c, u) \ + ({ \ + register word_t _o; \ + asm volatile("cas %0,%1,(%2)" : "=d"(_o) : "d"(u), "a"(&(m)), "0"(c)); \ + _o; \ + }) +#define CAS2(m1, c1, u1, m2, c2, u2) \ + asm volatile("cas2 %0:%1,%2:%3,(%4):(%5)" \ + : "=d"(c1), "=d"(c2) \ + : "d"(u1), "d"(u2), "g"(&(m1)), "g"(&(m2)), "0"(c1), "1"(c2)) +#endif /* mc68020 */ + +#elif defined(__i386__) +typedef unsigned int word_t; +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) +#define SWAPB(m, v) \ + ({ \ + register unsigned char _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) + +#elif defined(__x86_64__) +typedef unsigned long word_t; +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) +#define SWAPB(m, v) \ + ({ \ + register unsigned char _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) + +#elif defined(sparc) +typedef unsigned int word_t; + +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("swap [%2],%0" : "=r"(_o) : "0"(v), "r"(&(m))); \ + _o; \ + }) + +#define SWAPB(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("ldstub [%2],%0" : "=r"(_o) : "0"(v), "r"(&(m))); \ + _o; \ + }) + +#elif defined(m88k) +typedef unsigned int word_t; + +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xmem %0,%1,0" : "=r"(_o) 
: "r"(&(m)), "0"(v)); \ + _o; \ + }) + +#elif defined(__alpha__) +typedef unsigned long word_t; + +#define RW_NONSTRICT 1 +#define MEMORY_BARRIER asm volatile("mb") +#define LOAD_LOCK(m) \ + ({ \ + register word_t _o; \ + asm volatile("ldq_l %0,%1" : "=r"(_o) : "m"(m)); \ + _o; \ + }) +#define STORE_LOCK(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("stq_c %0,%1" : "=r"(_o) : "m"(m), "0"(v)); \ + _o; \ + }) + +#elif defined(__ppc__) +typedef unsigned long word_t; + +#define RW_NONSTRICT 1 +#define MEMORY_BARRIER asm volatile("eieio") +#define LOAD_LOCK(m) \ + ({ \ + register word_t _o; \ + asm volatile("lwarx %0,0,%1" : "=r"(_o) : "r"(&(m))); \ + _o; \ + }) +#define STORE_LOCK(m, v) \ + ({ \ + register int _o = 0; \ + asm volatile("stwcx. %2,0,%1\n" \ + "\tbc 5,2,$+8\n" \ + "\tori %0,%0,1" \ + : "=r"(_o) \ + : "r"(&(m)), "r"(v), "0"(_o)); \ + _o; \ + }) + +#else +#error Unknown machine type +#endif + +#if !defined(MEMORY_BARRIER) +#define MEMORY_BARRIER +#endif /* !MEMORY_BARRIER */ + +#if !defined(ATOMSET) +#define ATOMSET(m, v) \ + ({ \ + register word_t _o; \ + MEMORY_BARRIER; \ + _o = (m) = (v); \ + MEMORY_BARRIER; \ + _o; \ + }) +#endif /* !ATOMSET */ + +#if defined(LOAD_LOCK) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) \ + ({ \ + register word_t *_m = (word_t *)&(m); \ + LOAD_LOCK(*_m) ? 
1 : !STORE_LOCK(*_m, 1); \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(CAS) +#define CAS(m, c, u) \ + ({ \ + register word_t _o, _t = (u); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + if ((_o = LOAD_LOCK(*_m)) != (c)) \ + break; \ + } while (!STORE_LOCK(*_m, _t)); \ + _o; \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(SWAP) +#define SWAP(m, v) \ + ({ \ + register word_t _o, _v = (v); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + _o = LOAD_LOCK(*_m); \ + } while (!STORE_LOCK(*_m, _v)); \ + _o; \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(ATOMADD) +#define ATOMADD(m, v) \ + ({ \ + register word_t _o, _v = (v); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + _o = LOAD_LOCK(*_m) + _v; \ + } while (!STORE_LOCK(*_m, _o)); \ + _o; \ + }) +#endif + +#if defined(SWAPB) && !defined(TAS) +typedef unsigned char TAS_t; +#define TAS(m) SWAPB(m, 1) +#endif + +#if defined(SWAP) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) SWAP(m, 1) +#endif + +#if defined(CAS) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) CAS(m, 0, 1) +#endif + +#if defined(CAS) && !defined(SWAP) +#define SWAP(m, v) \ + ({ \ + register word_t _t_c1, _t_c2; \ + _t_c2 = _t_c1 = (m); \ + while ((_t_c1 = CAS(m, _t_c1, v)) != _t_c2) \ + ; \ + _t_c2 = _t_c1; \ + _t_c1; \ + }) +#endif + +#if defined(CAS) && !defined(ATOMADD) +#define ATOMADD(m, d) \ + ({ \ + word_t _v, _ov, _d; \ + _d = (d); \ + _ov = _v = (m); \ + while ((_v = CAS(m, _v, _v + _d)) != _ov) \ + _ov = _v; \ + _v + _d; \ + }) +#endif + +#if defined(SWAP) && !defined(ATOMADD) +#define ATOMADD(m, d) \ + ({ \ + word_t _v, _ov, _d; \ + _d = d; \ + _v = m; \ + while (_d) { \ + _ov = _v; \ + _v = SWAP(m, _v + _d); \ + _d = _v - _ov; \ + } \ + _v + _d; \ + }) +#endif + +#endif /* __GNUC__ */ + +#endif /* __atomic_h__ */ diff --git a/src/shm_malloc/make/Makefile b/src/shm_malloc/make/Makefile new file mode 100644 index 0000000..48f3135 --- /dev/null +++ b/src/shm_malloc/make/Makefile @@ -0,0 +1,53 
@@ + +# The following uses gcc with optimizing and debugging symbols +CC = gcc +CFLAGS = -ggdb -O3 -Wall + +MAKELIB = rm -f $@; ar qv $@ $^; ranlib $@ + +SRCS = malloc.c malloc.h atomic.h tshm1.c tshm2.c Makefile malloc.doc shm_malloc.h +MOBJS = malloc.o +SHMOBJS = shm_malloc.o +DBOBJS = db_malloc.o +DBSHMOBJS = db_shm_malloc.o + +all: libmalloc.a libshm.a libdbmalloc.a libdbshm.a tshm1 tshm1db tanon tshm2 tshm2db + +tshm1: tshm1.c $(SHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +tshm1db: tshm1.c $(DBSHMOBJS) + $(CC) $(CFLAGS) -DSHM_FILE='"tshm1db_file"' -o $@ $^ + +tanon: tshm1.c $(SHMOBJS) + $(CC) $(CFLAGS) -DSHM_FILE=0 -o $@ $^ + +tshm2: tshm2.c $(SHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +tshm2db: tshm2.c $(DBSHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +libmalloc.a: $(MOBJS); $(MAKELIB) +libshm.a: $(SHMOBJS); $(MAKELIB) +libdbmalloc.a: $(DBOBJS); $(MAKELIB) +libdbshm.a: $(DBSHMOBJS); $(MAKELIB) + +malloc.o: malloc.h + +shm_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DSHM malloc.c -o $@ + +db_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DMALLOC_DEBUG malloc.c -o $@ + +db_shm_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DMALLOC_DEBUG -DSHM malloc.c -o $@ + +tar: $(SRCS) + -rm -f malloc.tar malloc.tar.gz + tar cvf malloc.tar $(SRCS) + gzip -9 malloc.tar + +clean: + -rm *.o *.a tshm1 tshm1db tshm2 tshm2db tanon diff --git a/src/shm_malloc/malloc.c b/src/shm_malloc/malloc.c new file mode 100644 index 0000000..5f25f6d --- /dev/null +++ b/src/shm_malloc/malloc.c @@ -0,0 +1,1589 @@ +#include +#include +#include +#include +#include +#include + +#ifdef SHM +#define malloc shm_malloc +#define realloc shm_realloc +#define free shm_free +#define calloc shm_calloc +#define malloc_small shm_malloc_small +#define valloc shm_valloc +#define sbrk shm_sbrk +#define brk shm_brk +#define minit abort +#define mresize shm_mresize +#define msize shm_msize +#define heapdump shm_heapdump +#else +#ifdef INDIRECT +#define malloc _malloc +#define realloc _realloc +#define free _free +#define 
calloc _calloc +#define valloc _valloc +#define mresize _mresize +#define msize _msize +#define malloc_small _malloc_small +#define heapdump _heapdump +#endif +extern void *sbrk(intptr_t); +extern int brk(void *); +#endif + +#define _S(x) #x +#define S(x) _S(x) + +/* these are the different locking schemes. The numbers associated with +** them are unimportant; they need only be different */ +#define SYSVSEM 1 /* SysV Semaphores */ +#define FLOCK 2 /* File Locks */ +#define SPINLOCK 3 /* atomic test-and-set spinlocks */ +#define PMUTEX 4 /* pthreads mutexes */ + +#if defined(SHM) || defined(_REENTRANT) || defined(_POSIX_THREADS) +#ifndef LOCKTYPE +#if defined(_POSIX_THREADS) /* && !defined(__CYGWIN__) */ +#define LOCKTYPE PMUTEX +#elif defined(__GNUC__) && \ + (defined(mc68000) || defined(sparc) || defined(m88k) || \ + defined(__alpha__) || defined(__ppc__) || defined(__i386__)) +#define LOCKTYPE SPINLOCK +#else +#define LOCKTYPE FLOCK +#endif +#endif + +#else /* !SHM && !_REENTRANT && !_POSIX_THREADS */ + +#undef LOCKTYPE + +#endif /* SHM || _REENTRANT || _POSIX_THREADS */ + +#if defined(__CYGWIN__) +/* don't try to use thread-local vars on cygwin */ +#define thread_local +#elif defined(__STDC__) && __STDC_VERSION__ >= 199901L +#define thread_local __thread +#elif defined(__GNUC__) && __GNUC__ >= 4 +#define thread_local __thread +#else +#define thread_local +#endif + +#include "malloc.h" + +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#undef SMLIST +static inline int SMLIST(int sz) { + int rv; + asm("bsr %1,%0" : "=r"(rv) : "r"((sz - 1) | 1)); + rv -= 7 - size256; + if (rv < 0) + rv = 0; + if (rv >= NUMSMALL) + rv = -1; + return rv; +} +#endif + +#ifdef SHM +#include +#include +#include +#include + +struct basepage *const membase = +#if __SIZEOF_POINTER__ == 8 +#define membase ((struct basepage *)0x1000000000L) +#else +#if defined(sun) +#define membase ((struct basepage *)0xe0000000) +#elif defined(sgi) +#define membase ((struct basepage 
*)0x08000000) +#elif defined(_AIX) +#define membase ((struct basepage *)0x40000000) +#elif defined(__FreeBSD__) +#define membase ((struct basepage *)0x10000000) +#elif defined(__hpux__) +#define membase ((struct basepage *)0xa0000000) +#elif defined(__CYGWIN__) +#define membase ((struct basepage *)0x40000000) +#else +#define membase ((struct basepage *)0x80000000) +#endif +#endif + membase; + +/* minimum number of additional pages to mmap when expanding the heap */ +#define MMAP_INCR 16 + +static void *localbrk; +static int mfd; + +#else /* !SHM */ + +static struct basepage *membase; + +#endif /* SHM */ + +#if defined(SHM) || defined(_REENTRANT) || defined(_POSIX_THREADS) +#if LOCKTYPE == SYSVSEM +#if defined(_AIX) || defined(__osf__) +/* AIX and OSF/1 have eliminated union semun, but are otherwise compatable */ +union semun { + int val; + struct semid_ds *buf; + ushort *array; +}; +#endif /* _AIX || __osf__ */ + +static int semid; +static struct sembuf sembuf; + +#define FIRSTKEY 1 /* first semaphore key to try */ +static int lock_init(int init) { + if (init) { + int um = umask(0); + umask(um); + um = ~um & 0777; + membase->semkey = FIRSTKEY; + while ((semid = semget(membase->semkey, NUMSMALL + 1, + IPC_CREAT | IPC_EXCL | um)) < 0 && + errno == EEXIST) + membase->semkey++; + if (semid >= 0) { + ushort arr[NUMSMALL + 1]; + int i; + union semun semu; + semu.array = arr; + for (i = 0; i <= NUMSMALL; i++) + arr[i] = 1; + if (semctl(semid, 0, SETALL, semu) < 0) { + semctl(semid, 0, IPC_RMID, semu); + return -1; + } + } + } else + semid = semget(membase->semkey, 0, 0); + return (semid < 0) ? -1 : 0; +} +#define LOCK(q) \ + do { \ + sembuf.sem_num = q < NUMSMALL ? q : NUMSMALL; \ + sembuf.sem_op = -1; \ + sembuf.sem_flg = 0; \ + while (semop(semid, &sembuf, 1) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define UNLOCK(q) \ + do { \ + sembuf.sem_num = q < NUMSMALL ? 
q : NUMSMALL; \ + sembuf.sem_op = 1; \ + sembuf.sem_flg = 0; \ + while (semop(semid, &sembuf, 1) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define LOCK_FINI +#define LOCK_DESTROY \ + do { \ + union semun semu; \ + semu.val = 0; \ + semctl(semid, 0, IPC_RMID, semu); \ + } while (0) +#endif /* SYSVSEM */ + +#if LOCKTYPE == FLOCK +static int lfd; +static struct flock lock; + +static int lock_init(int init) { + char lfile[1024]; + + strcpy(lfile, membase->mfile); + strcat(lfile, ".lock"); + lock.l_whence = SEEK_SET; + lock.l_len = 1; + if (!init) { + if ((lfd = open(lfile, O_RDWR, 0)) < 0) + return -1; + } else if ((lfd = open(lfile, O_RDWR | O_CREAT, 0666)) < 0) + return -1; + else + ftruncate(lfd, lfile, NUMSMALL + 1); + fcntl(lfd, F_SETFD, FD_CLOEXEC); + return 0; +} +#define LOCK(q) \ + do { \ + lock.l_type = F_WRLCK; \ + lock.l_start = (q); \ + while (fcntl(lfd, F_SETLKW, &lock) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define UNLOCK(q) \ + do { \ + lock.l_type = F_UNLCK; \ + lock.l_start = (q); \ + while (fcntl(lfd, F_SETLK, &lock) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define LOCK_FINI close(lfd) +#define LOCK_DESTROY \ + do { \ + char lfile[1024]; \ + strcpy(lfile, membase->mfile); \ + strcat(lfile, ".lock"); \ + unlink(lfile); \ + close(lfd); \ + } while (0) +#endif /* FLOCK */ + +#if LOCKTYPE == SPINLOCK +#include +static int lock_init(int init) { + if (init) { + int i; + for (i = NUMSMALL; i >= 0; i--) + membase->locks[i] = 0; + } + return 0; +} +#define LOCK(q) \ + do { \ + volatile TAS_t *_l = &membase->locks[q]; \ + int _try = 10; \ + while (_try > 0 && (*_l || TAS(*_l))) \ + _try--; \ + if (!_try) \ + while (*_l || TAS(*_l)) { \ + struct timeval to = {0, 1000}; \ + select(0, 0, 0, 0, &to); \ + } \ + MEMORY_BARRIER; \ + } while (0) +#define UNLOCK(q) \ + do { \ + MEMORY_BARRIER; \ + membase->locks[q] = 0; \ + } while (0) +#define LOCK_FINI +#define LOCK_DESTROY +#endif /* SPINLOCK */ + +#if LOCKTYPE == PMUTEX +static int 
lock_init(int init) { + if (init) { + int i; + for (i = NUMSMALL; i >= 0; i--) + pthread_mutex_init(&membase->locks[i], 0); + } + return 0; +} + +#define LOCK(q) pthread_mutex_lock(&membase->locks[q]) +#define UNLOCK(q) pthread_mutex_unlock(&membase->locks[q]) +#define LOCK_FINI +#define LOCK_DESTROY +#endif /* PMUTEX */ + +#else /* !SHM && !_REENTRANT && !_POSIX_THREADS */ + +#define LOCK(q) +#define UNLOCK(q) +static int lock_init() { return 0; } + +#endif /* SHM || _REENTRANT || _POSIX_THREADS */ + +typedef unsigned long U; + +#define TARGET(l) (2 << ((NUMSMALL - 1 - (l)) / 2)) +#define PAGENUM(p) (((U)(p) - (U)membase) / PAGESIZE) +#define PAGEADDR(n) ((void *)((U)membase + (U)(n)*PAGESIZE)) +#define PAGEBASE(p) ((U)p & ~(PAGESIZE - 1)) +#define I2(pn) ((pn) % (PAGESIZE / sizeof(struct page))) +#define I1(pn) ((pn) / (PAGESIZE / sizeof(struct page))) +#define ADDR2PAGE(p) (&membase->pages[I1(PAGENUM(p))][I2(PAGENUM(p))]) +#define NUM2PAGE(n) (&membase->pages[I1(n)][I2(n)]) +#define VALID(p) (((U)(p) > (U)membase) && ((U)(p) < (U)membase->end)) +#define FREEPAGE(n) ((struct freepage *)PAGEADDR(n)) + +#ifdef MALLOC_DEBUG +static unsigned long lcrng(unsigned long s) { + unsigned long long mod = (1LL << 31) - 1; + unsigned long long t = s * 16807LL; + + t = (t & mod) + (t >> 31); + if (t > mod) + t -= mod; + return t; +} + +#define GUARD 0xa1962f8dU +#define DB(code) code +#else /* !MALLOC_DEBUG */ +#define DB(code) +#endif /* MALLOC_DEBUG */ + +static inline int pcmp(unsigned _a, unsigned _b) { + struct page *a = NUM2PAGE(_a), *b = NUM2PAGE(_b); + int v; + + v = a->count - b->count; + return v ? v : (long)_a - (long)_b; +} + +#if 0 +/* + * FIXME -- profile this sorter and maybe choose a better one? + * FIXME -- this pivot choice is pessimal for a reversed list, but very + * FIXME -- good (O(n)) for almost sorted lists, which should be our + * FIXME -- common case. 
Probably not a big deal as the lists should + * FIXME -- rarely be big + * FIXME -- we're also assuming the optimizer will do a good job CSEing + * FIXME -- these NUM2PAGE macros after inlining pcmp + * + * This algorithm has very bad behavior with a list that is sorted except + * for the last element, which turns out to be a somewhat common case here. + * + * Quicksort with last-sorted pivot + */ +unsigned page_list_sort(unsigned p, unsigned **tail) +{ +unsigned a, *a_tail, b, *b_tail, pivot; + + if (!p) return p; + pivot = a = p; + a_tail = &a; + p = NUM2PAGE(p)->page; + while (p) { + if (pcmp(pivot, p) > 0) break; + a_tail = &NUM2PAGE(pivot)->page; + pivot = p; + p = NUM2PAGE(p)->page; } + if (!p) { + if (tail) *tail = &NUM2PAGE(pivot)->page; + return a; } + b_tail = &b; + while (p) { + if (pcmp(pivot, p) > 0) { + *a_tail = p; + a_tail = &NUM2PAGE(p)->page; } + else { + *b_tail = p; + b_tail = &NUM2PAGE(p)->page; } + p = NUM2PAGE(p)->page; } + *a_tail = 0; + *b_tail = 0; + if (a) a = page_list_sort(a, &a_tail); + if (b) b = page_list_sort(b, &b_tail); + *a_tail = pivot; + NUM2PAGE(pivot)->page = b; + if (tail) + *tail = b ? b_tail : &NUM2PAGE(pivot)->page; + return a; +} +#else +/* + * FIXME -- profile this sorter and maybe choose a better one? 
+ * + * simple split/merge sort */ +unsigned page_list_sort(unsigned p) { + unsigned a, b, *t; + int asort = 0; + + if (!p) + return p; + a = b = p; + b = NUM2PAGE(b)->page; + if (!b) + return p; + while (b) { + if (!(b = NUM2PAGE(b)->page)) + break; + unsigned l = a; + a = NUM2PAGE(a)->page; + if (!asort && pcmp(l, a) > 0) + asort = 1; + b = NUM2PAGE(b)->page; + } + b = page_list_sort(NUM2PAGE(a)->page); + NUM2PAGE(a)->page = 0; + a = p; + if (asort) + a = page_list_sort(a); + t = &p; + while (a && b) { + if (pcmp(a, b) <= 0) { + *t = a; + t = &NUM2PAGE(a)->page; + a = *t; + } else { + *t = b; + t = &NUM2PAGE(b)->page; + b = *t; + } + } + *t = a | b; + return p; +} +#define page_list_sort(p, t) page_list_sort(p) +#endif + +static thread_local struct localfree freelists[NUMSMALL]; +#ifdef MALLOC_DEBUG +static thread_local struct backup { + struct backup *next; + struct chunk *item; +} * backupfree[NUMSMALL], *backupaux[NUMSMALL], *spare; + +static void tbackup(int i) { + struct chunk *p; + struct backup *q; + + for (p = freelists[i].free, q = backupfree[i]; p && q; + p = p->next, q = q->next) + if (p != q->item) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + if (p || q) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + for (p = freelists[i].aux, q = backupaux[i]; p && q; p = p->next, q = q->next) + if (p != q->item) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + if (p || q) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } +} + +static struct backup *balloc() { + struct backup *p; + int i; + + if (!spare) { + p = valloc(PAGESIZE); + i = PAGESIZE / sizeof(struct backup) - 1; + p[i].next = 0; + for (i--; i >= 0; i--) + p[i].next = &p[i + 1]; + spare = p; + } + p = spare; + spare = p->next; + return p; +} + +static void bfree(struct backup *p) { + p->next = spare; + spare = p; +} + +static void *gcheck(void *_b) { + U *b = _b; 
+ if (b) { + b -= 2; + if (b[0] != GUARD || b[1] != lcrng((U)b)) { + printf("***guard corrupted at %p\n", b); + abort(); + } + } + return b; +} + +static void *gsetup(void *_b) { + U *b = _b; + if (b) { + *b++ = GUARD; + *b++ = lcrng((U)_b); + } + return b; +} +#endif /* MALLOC_DEBUG */ + +void *malloc(size_t size) { + int sc; + void *rv; + + DB(size += 2 * sizeof(U);) + sc = SMLIST(size); + if (sc >= 0) + rv = malloc_small(sc); + else + rv = valloc(size); + DB(rv = gsetup(rv);) + return rv; +} + +static void msetup() { + memset(membase, 0, 3 * PAGESIZE); + memcpy(membase->magic, "SHM ", 4); + membase->param[0] = 0; + membase->param[1] = sizeof(void *); +#ifdef LOCKTYPE + membase->param[2] = LOCKTYPE; +#endif +#ifdef MALLOC_DEBUG + membase->param[2] |= 0x80; +#endif + membase->param[3] = NUMSMALL; + membase->base = membase; + membase->pages = (struct page **)((U)membase + PAGESIZE); + membase->end = (void *)((U)membase + PAGESIZE * 3); + membase->pages[0] = (struct page *)((U)membase + PAGESIZE * 2); + membase->pages[0][0].code = BASE; + membase->pages[0][1].code = BASE; + membase->pages[0][2].code = BASE; +} + +#ifdef SHM +static int msetup_valid() { + if (memcmp(membase->magic, "SHM ", 4)) + return 0; + if (membase->param[0] != 0) + return 0; + if (membase->param[1] != sizeof(void *)) + return 0; + if ((membase->param[2] & 0x7f) != LOCKTYPE) + return 0; +#ifdef MALLOC_DEBUG + if (!(membase->param[2] & 0x80)) + return 0; +#else + if (membase->param[2] & 0x80) + return 0; +#endif + if (membase->param[3] != NUMSMALL) + return 0; + if (membase->base != membase) + return 0; + return 1; +} +#endif + +#ifndef SHM +static void minit() { + U p; + + p = (U)sbrk(0); + if (p % PAGESIZE) { + sbrk(PAGESIZE - p % PAGESIZE); + p = (U)sbrk(0); + } + assert(p % PAGESIZE == 0); + + membase = (struct basepage *)p; + sbrk(PAGESIZE * 3); + msetup(); + lock_init(1); +} + +#ifdef WINNT +int brk(void *p) { + void *op = sbrk(0); + + return (int)sbrk((int)p - (int)op); +} +#endif /* WINNT 
*/ + +#else /* SHM */ + +#ifdef __CYGWIN__ + +/* Cygwin's mmap can't deal with multiple partial mappings of a file, so + * in order to map more of our shared mem file, we need to unmap what we + * have mapped and then remap the whole thing as one chunk. This doesn't + * work for anonymous mapping (we'd lose what was previously mapped), so + * we always do them in multiples of 64K which seems to work out ok */ + +static void *cygwin_mmap(void *addr, size_t length, int prot, int flags, int fd, + off_t offset) { + if (fd >= 0 && addr != membase) { + munmap(membase, offset); + length += offset; + addr = membase; + offset = 0; + } else if (fd < 0) { + /* how much was already mapped by a previous mmap */ + size_t done = -(intptr_t)addr & 0xffff; + if (length <= done) + return addr; + addr = (char *)addr + done; + length -= done; + /* round up to 64K */ + length |= 0xffff; + length++; + offset = 0; /* should be ignored by mmap */ + } + return mmap(addr, length, prot, flags, fd, offset); +} + +#define mmap cygwin_mmap + +#endif /* __CYGWIN__ */ + +static struct sigaction oldsegv; + +static void shm_segv() { + void *newbrk; + int flags = MAP_SHARED | MAP_FIXED; + + /* if a SEGV occurred and there's new memory to be mapped, map it + ** and retry */ + if (mfd < 0) + flags |= MAP_ANONYMOUS; + newbrk = membase->eof; + if (newbrk > localbrk) { + if (mfd >= 0) + lseek(mfd, 0, SEEK_SET); + mmap(localbrk, newbrk - localbrk, PROT_READ | PROT_WRITE, flags, mfd, + localbrk - (void *)membase); + localbrk = newbrk; + } else { + /* no more to map, must be a real SEGV */ + sigaction(SIGSEGV, &oldsegv, 0); + } +} + +int shm_destroy() { + LOCK_DESTROY; + unlink(membase->mfile); + munmap(membase, localbrk - (void *)membase); + close(mfd); + return 0; +} + +int shm_init(const char *mfile, void (*init_fn)()) { + int tmp, wait = 5; + int flags = MAP_SHARED | MAP_FIXED; + struct sigaction segv; + + mfd = -1; + if (!mfile) + flags |= MAP_ANONYMOUS; + while (mfd == -1) { + if (mfile && (mfd = 
open(mfile, O_RDWR, 0)) >= 0) { + /* make sure the file isn't empty */ + while (read(mfd, &tmp, sizeof(tmp)) == 0) { + if (wait-- < 0) { + close(mfd); + errno = EINVAL; + return -1; + } + sleep(1); + } + lseek(mfd, 0, SEEK_SET); + if ((long)mmap(membase, PAGESIZE, PROT_READ | PROT_WRITE, flags, mfd, + 0) == -1) { + close(mfd); + return -1; + } + /* wait until initialization is complete */ + while (!membase->init && wait-- > 0) + sleep(1); + if (!membase->init || !msetup_valid()) { + munmap(membase, PAGESIZE); + close(mfd); + errno = EINVAL; + return -1; + } + localbrk = membase->eof; + lseek(mfd, 0, SEEK_SET); + if ((long)mmap((void *)membase + PAGESIZE, + (localbrk - (void *)membase) - PAGESIZE, + PROT_READ | PROT_WRITE, flags, mfd, PAGESIZE) == -1) { + close(mfd); + return -1; + } + } else if (!mfile || + (errno == ENOENT && + (mfd = open(mfile, O_RDWR | O_CREAT | O_EXCL, 0666)) >= 0)) { + if (mfd >= 0) { + if (ftruncate(mfd, 3 * PAGESIZE) < 0) { + close(mfd); + return -1; + } + lseek(mfd, 0, SEEK_SET); + } + if ((long)mmap(membase, 3 * PAGESIZE, PROT_READ | PROT_WRITE, flags, mfd, + 0) == -1) { + close(mfd); + return -1; + } + msetup(); + localbrk = membase->brk = membase->eof = membase->end; + strcpy(membase->mfile, mfile ? 
mfile : ""); + membase->global = 0; + break; + } else if (errno != EEXIST) + return -1; + } + fcntl(mfd, F_SETFD, FD_CLOEXEC); + if (lock_init(!membase->init) < 0) { + munmap(membase, localbrk - (void *)membase); + close(mfd); + return -1; + } + if (!membase->init && init_fn) + init_fn(); + segv.sa_flags = 0; + sigemptyset(&segv.sa_mask); + segv.sa_handler = shm_segv; + sigaction(SIGSEGV, &segv, &oldsegv); + membase->init = 1; + return 0; +} + +static void flush_to_global_freelist(int, struct chunk *, struct chunk *); + +/* flush out local free lists, so we can exit leaving memory consistent */ +int shm_fini() { + int l; + + for (l = 0; l < NUMSMALL; l++) { + if (freelists[l].aux || freelists[l].free) + flush_to_global_freelist(l, freelists[l].free, freelists[l].aux); + freelists[l].aux = freelists[l].free = 0; + freelists[l].count = 0; + } + // munmap(membase, localbrk - (void *)membase); + // close(mfd); + LOCK_FINI; + return 0; +} + +/* clear all the free lists, as they really belong to the parent */ +int shm_child() { + int l; + + for (l = 0; l < NUMSMALL; l++) + freelists[l].aux = freelists[l].free = 0; + return 0; +} + +static int shm_brk(void *newbrk) { + char tmp = 0; + int flags = MAP_SHARED | MAP_FIXED; + + if (mfd < 0) + flags |= MAP_ANONYMOUS; + if (newbrk <= membase->brk) { + if (ftruncate(mfd, newbrk - (void *)membase) < 0) + return -1; + if (newbrk < membase->eof) + munmap(newbrk, membase->eof - newbrk); + membase->brk = membase->eof = localbrk = newbrk; + } else if (newbrk <= membase->eof) { + membase->brk = newbrk; + if (newbrk > localbrk) { + if (mfd >= 0) + lseek(mfd, 0, SEEK_SET); + if ((long)mmap(localbrk, membase->eof - localbrk, PROT_READ | PROT_WRITE, + flags, mfd, localbrk - (void *)membase) == -1) + return -1; + localbrk = membase->eof; + } + } else { + void *neweof = membase->brk + PAGESIZE * MMAP_INCR; + if (newbrk > neweof) + neweof = newbrk; + if (mfd >= 0) { + if (ftruncate(mfd, neweof - (void *)membase) < 0) + return -1; + if 
(lseek(mfd, neweof - (void *)membase - 1, SEEK_SET) < 0) + return -1; + if (write(mfd, &tmp, 1) != 1) + return -1; + lseek(mfd, 0, SEEK_SET); + } + membase->eof = neweof; + if ((long)mmap(localbrk, neweof - localbrk, PROT_READ | PROT_WRITE, flags, + mfd, localbrk - (void *)membase) == -1) + return -1; + localbrk = neweof; + membase->brk = newbrk; + } + return 0; +} + +static void *shm_sbrk(int delta) { + void *oldbrk = membase->brk; + + return shm_brk(membase->brk + delta) < 0 ? (void *)-1 : oldbrk; +} + +void *shm_global() { return membase->global; } + +void shm_set_global(void *v) { membase->global = v; } + +#endif /* SHM */ + +/* +** The free page list contains all the entirely free pages. It is organized +** as a `list of lists' with blocks of the same size in the same list. +** the lists are sorted order of size (smallest first), and each list is +** sorted in memory order (lowest address first) +*/ + +#ifdef MALLOC_DEBUG +/* check the global freepage lists to ensure consistency, and ensure that + * 'p' is present (free) on there */ +static void fp_verify(struct freepage *p) { + struct freepage *t1, *t2; + struct page *pp; + int i; + + if (membase->freepages && + (!VALID(membase->freepages) || + membase->freepages->parent != &membase->freepages)) { + printf("***" S(free) "list corrupt (base table)\n"); + abort(); + } + for (t1 = membase->freepages; t1; t1 = t1->bigger) { + if (t1->bigger && (!VALID(t1->bigger) || t1->size >= t1->bigger->size || + t1->bigger->parent != &t1->bigger)) { + printf("***" S(free) "list corrupt (page %p ?)\n", t1); + abort(); + } + for (t2 = t1; t2; t2 = t2->next) { + if (p == t2) + p = 0; + if (t2->next && + (!VALID(t2->next) || t2->next->bigger || t2->size != t2->next->size || + t2->next->parent != &t2->next)) { + printf("***" S(free) "list corrupt (page %p ?)\n", t2); + abort(); + } + pp = ADDR2PAGE(t2); + if (pp->code != BIG + FREE || + PAGEADDR(pp->page - t2->size) != (void *)t2) { + printf("***page tables corrupt (page %p)\n", 
t2); + abort(); + } + for (i = 1; i < t2->size; i++) { + struct page *ip = NUM2PAGE(PAGENUM(t2) + i); + if (ip->code != MIDDLE || PAGEADDR(ip->page) != (void *)t2) { + printf("***page tables corrupt (page %p)\n", + (char *)t2 + i * PAGESIZE); + abort(); + } + } + } + } + if (p) { + printf("***apparently free page %p not on " S(free) "list\n", p); + abort(); + } +} +#else /* !MALLOC_DEBUG */ +#define fp_verify(p) +#endif /* MALLOC_DEBUG */ + +static void fp_remove(struct freepage *p) { + fp_verify(p); + if (p->next) { + (*p->parent) = p->next; + p->next->parent = p->parent; + if ((p->next->bigger = p->bigger)) + p->bigger->parent = &p->next->bigger; + } else { + if (((*p->parent) = p->bigger)) + p->bigger->parent = p->parent; + } +} + +static void fp_add(struct freepage *p) { + struct freepage **t = &membase->freepages; + + fp_verify(0); + while (*t && (*t)->size < p->size) + t = &(*t)->bigger; + if (*t && (*t)->size == p->size) { + while (*t && (U)*t < (U)p) + t = &(*t)->next; + if ((p->next = (*t))) { + if ((p->bigger = p->next->bigger)) { + p->bigger->parent = &p->bigger; + p->next->bigger = 0; + } + p->next->parent = &p->next; + } else + p->bigger = 0; + } else { + p->next = 0; + if ((p->bigger = (*t))) + p->bigger->parent = &p->bigger; + } + *t = p; + p->parent = t; +} + +static struct freepage *fp_find(U size) { + struct freepage *t; + + fp_verify(0); + for (t = membase->freepages; t && t->size < (int)size; t = t->bigger) + ; + if (t) + fp_remove(t); + return t; +} + +void *malloc_small(int l) { + struct chunk *new; + + if (!membase) + minit(); + assert(l >= SMLIST(sizeof(void *)) && l < NUMSMALL); + DB(tbackup(l)); + if (!freelists[l].free) { + if (freelists[l].aux) { + freelists[l].free = freelists[l].aux; + freelists[l].aux = 0; + DB(backupfree[l] = backupaux[l]); + DB(backupaux[l] = 0); + } else { + int i; + struct chunk *new_fl = 0; + if (!freelists[l].target) + freelists[l].target = TARGET(l); + LOCK(l); + for (i = freelists[l].target; i; i--) { + 
unsigned pn; + struct page *p; + if (!(pn = membase->freechunks[l])) { + int j; + if (!(new = valloc(PAGESIZE))) { + UNLOCK(l); + return 0; + } + pn = PAGENUM(new); + p = ADDR2PAGE(new); + p->code = l; + p->count = j = PERPAGE(l); + p->free = 0; + p->page = 0; + while (--j) { + struct chunk *prev = new; + new = (struct chunk *)((U) new + SIZE(l)); + prev->next = new; + } + new->next = 0; + membase->freechunks[l] = pn; + } + p = NUM2PAGE(pn); + new = (struct chunk *)((U)PAGEADDR(pn) + p->free * SIZE(l)); + if (new->next) + p->free = ((U) new->next - (U)PAGEADDR(pn)) / SIZE(l); + else { + p->free = 0; + assert(p->count == 1); + } + if (!--p->count) { + assert(p->free == 0); + membase->freechunks[l] = p->page; + } + new->next = new_fl; + new_fl = new; + } + freelists[l].free = new_fl; + DB({ + struct chunk *p; + struct backup **q; + for (p = new_fl, q = &backupfree[l]; p; p = p->next, q = &(*q)->next) { + *q = balloc(); + (*q)->item = p; + } + *q = 0; + }); + UNLOCK(l); + } + freelists[l].count = freelists[l].target; + } + new = freelists[l].free; + freelists[l].free = new->next; + DB({ + struct backup *tmp = backupfree[l]; + backupfree[l] = tmp->next; + bfree(tmp); + }); + freelists[l].count--; + DB(tbackup(l)); + return (void *)new; +} + +/* allocate 'size' pages without expanding the heap. + * Return 0 if that's not possible + * must hold LOCK(NUMSMALL) before calling */ +static void *alloc_pages(int size) { + unsigned i; + void *p = fp_find(size); + if (p) { + unsigned pn = PAGENUM(p); + struct page *pg = NUM2PAGE(pn); + pg->code = BIG; + if (pg->page - pn > (int)size) { + unsigned extra = pn + size; + struct page *extrapg = NUM2PAGE(extra); + extrapg->code = BIG + FREE; + extrapg->page = pg->page; + FREEPAGE(extra)->size = i = pg->page - extra; + while (--i) + NUM2PAGE(extra + i)->page = extra; + fp_add(FREEPAGE(extra)); + } + } + return p; +} + +/* Free a block of one or more pages, without shrinking the heap. 
+ * Coalesce with adjacent free block and return the resulting (possibly + * larger) free block + * Must hold LOCK(NUMSMALL) before calling. */ +static struct freepage *free_pages(void *p) { + unsigned i, adj; + struct page *pg = ADDR2PAGE(p), *adjpg; + struct freepage *fpage = p; + assert(pg->code == BIG); + pg->code = BIG + FREE; + adj = PAGENUM(p) - 1; + adjpg = NUM2PAGE(adj); + if (adjpg->code == MIDDLE) { + adj = adjpg->page; + adjpg = NUM2PAGE(adj); + } + if (adjpg->code == BIG + FREE) { + fpage = FREEPAGE(adj); + fp_remove(fpage); + adjpg->page = pg->page; + pg->code = MIDDLE; + for (i = PAGENUM(p); i < adjpg->page; i++) + NUM2PAGE(i)->page = adj; + pg = adjpg; + fpage->size = adjpg->page - adj; + } else + fpage->size = pg->page - PAGENUM(p); + if (PAGEADDR(pg->page) < membase->end) { + adj = pg->page; + adjpg = NUM2PAGE(adj); + if (adjpg->code == BIG + FREE) { + fp_remove(FREEPAGE(adj)); + adjpg->code = MIDDLE; + pg->page = adjpg->page; + for (i = adj; i < pg->page; i++) + NUM2PAGE(i)->page = PAGENUM(fpage); + fpage->size = pg->page - PAGENUM(fpage); + } + } + fp_add(fpage); + return fpage; +} + +/* Initialize the page descriptors for an extent of memory that is in use. + * Must hold LOCK(NUMSMALL) before calling. */ +static void setup_extent_descriptor(void *p, int size) { + struct page *pg = ADDR2PAGE(p); + pg->page = PAGENUM(p) + size; + pg->code = BIG; + while (--size > 0) { + pg = NUM2PAGE(PAGENUM(p) + size); + pg->page = PAGENUM(p); + pg->code = MIDDLE; + } +} + +/* Expand the master page descriptor table to contain descriptors for pages. + * up to "to". Must hold LOCK(NUMSMALL) before calling. 
*/ +static int expand_page_table(unsigned to) { + int i, added_pages = 0, newmastersize = 0; + unsigned old = PAGENUM(membase->end) - 1; + void *oldmaster = 0; + if (PAGENUM(&membase->pages[I1(to)]) != PAGENUM(&membase->pages[I1(old)])) { + /* FIXME -- there's a race condition here when we resize the top-level + * FIXME -- pages table, as everyone accesses it without aquiring a + * FIXME -- lock. So we ensure that the old top-level table remains + * FIXME -- valid for awhile after being reallocated. That way. if + * FIXME -- someone is in the middle of accessing it, they'll still + * FIXME -- get the right info. as long as they're not delayed */ + struct page **master; + int oldmastersize = I1(old) / (PAGESIZE / sizeof(struct page *)) + 1; + newmastersize = I1(to) / (PAGESIZE / sizeof(struct page *)) + 1; + if (!(master = alloc_pages(newmastersize))) { + to += newmastersize; + newmastersize = I1(to) / (PAGESIZE / sizeof(struct page *)) + 1; + if ((master = sbrk(newmastersize * PAGESIZE)) == (void *)-1) + return 0; + membase->end = (void *)((U)master + newmastersize * PAGESIZE); + } + memcpy(master, membase->pages, oldmastersize * PAGESIZE); + memset((void *)((U)master + oldmastersize * PAGESIZE), 0, + (newmastersize - oldmastersize) * PAGESIZE); + void *oldmaster = membase->pages; + membase->pages = master; + /* remark the old master as a generic extent, so we can free it */ + setup_extent_descriptor(oldmaster, oldmastersize); + } + for (i = I1(old) + 1; i <= I1(to); i++) { + assert(membase->pages[i] == 0); + if ((membase->pages[i] = alloc_pages(1))) { + ADDR2PAGE(membase->pages[i])->code = BASE; + } else { + if ((membase->pages[i] = sbrk(PAGESIZE)) == (void *)-1) { + if (oldmaster) + free_pages(oldmaster); + return 0; + } + added_pages++; + } + memset(membase->pages[i], 0, PAGESIZE); + } + membase->end = PAGEADDR(to + 1); + if (newmastersize) + for (i = 0; i < newmastersize; i++) + NUM2PAGE(PAGENUM(membase->pages) + i)->code = BASE; + if (added_pages) { + if 
(!expand_page_table(to + added_pages)) { + if (oldmaster) + free_pages(oldmaster); + return 0; + } + for (i = I1(to); added_pages; i--, added_pages--) + ADDR2PAGE(membase->pages[i])->code = BASE; + } + if (oldmaster) + free_pages(oldmaster); + return 1; +} + +void *valloc(size_t size) { + void *new; + + size = (size + PAGESIZE - 1) / PAGESIZE; /* size in pages */ + if (!membase) + minit(); + LOCK(NUMSMALL); + if (!(new = alloc_pages(size))) { + if ((new = sbrk(size * PAGESIZE)) == (void *)-1) { + UNLOCK(NUMSMALL); + return 0; + } + if ((U) new % PAGESIZE) { + if (sbrk(PAGESIZE - (U) new % PAGESIZE) == (void *)-1) { + if (brk(new)) /* ignore return value */ + ; + UNLOCK(NUMSMALL); + return 0; + } + new += PAGESIZE - (U) new % PAGESIZE; + } + if (I1(PAGENUM(new) + size - 1) != I1(PAGENUM(membase->end) - 1)) { + if (!expand_page_table(PAGENUM(new) + size - 1)) { + if ((U) new > (U)membase->end) { + if (brk(new)) /* ignore return value */ + ; + UNLOCK(NUMSMALL); + return 0; + } + } + } else + membase->end = new + size *PAGESIZE; + setup_extent_descriptor(new, size); + } + UNLOCK(NUMSMALL); + return new; +} + +static void flush_to_global_freelist(int l, struct chunk *cp, + struct chunk *cp2) { + struct chunk *tmp; + struct page *p; + + LOCK(l); + if (!cp) { + cp = cp2; + cp2 = 0; + } + for (; cp; cp = tmp) { + if (!(tmp = cp->next)) { + tmp = cp2; + cp2 = 0; + } + p = ADDR2PAGE(cp); + cp->next = (void *)(p->count ? 
PAGEBASE(cp) + p->free * SIZE(l) : 0); + p->free = ((U)cp - PAGEBASE(cp)) / SIZE(l); + if (!p->count) { + p->page = membase->freechunks[l]; + membase->freechunks[l] = PAGENUM(cp); + } + if (++p->count >= PERPAGE(l)) { + p->count = -1; + } + } + membase->freechunks[l] = page_list_sort(membase->freechunks[l], 0); + while (membase->freechunks[l] && + (p = NUM2PAGE(membase->freechunks[l]))->count < 0) { + unsigned pn = membase->freechunks[l]; + void *pp = PAGEADDR(pn); + membase->freechunks[l] = p->page; + p->code = BIG; + p->count = 0; + p->free = 0; + p->page = pn + 1; + DB(pp = gsetup(pp);) + free(pp); + } + UNLOCK(l); +} + +void free(void *_old) { + struct chunk *old = _old; + struct page *p; + int l; +#ifdef MALLOC_DEBUG + struct chunk *t, *last; + int i; +#endif /* MALLOC_DEBUG */ + + if (!old) + return; +#ifdef MALLOC_DEBUG + if ((U)old < (U)membase || (U)old >= (U)membase->end) { + printf("***Invalid pointer given to " S(free) " %p\n", old); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(old = gcheck(old);) + p = ADDR2PAGE(old); + if ((l = p->code) < NUMSMALL) { +#ifdef MALLOC_DEBUG + if (((U)old & (SIZE(l) - 1)) != 0) { + printf("***Invalid pointer given to " S(free) " %p\n", old); + abort(); + } + for (last = 0, t = freelists[l].free, i = 0; t; + last = t, t = t->next, i++) { + if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (!VALID(t) || ADDR2PAGE(t)->code != l) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > freelists[l].count) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i != freelists[l].count) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + for (last = 0, t = freelists[l].aux, i = 0; t; last = t, t = t->next, i++) { 
+ if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (!VALID(t) || ADDR2PAGE(t)->code != l) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > freelists[l].target) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i && i != freelists[l].target) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (p->count) { + t = (void *)(PAGEBASE(old) + p->free * SIZE(l)); + for (last = 0, i = 0; t; last = t, t = t->next, i++) { + if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if ((U)t / PAGESIZE != (U)old / PAGESIZE) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > p->count) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i != p->count) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } +#endif /* MALLOC_DEBUG */ + DB(tbackup(l)); + if (freelists[l].count == freelists[l].target) { + if (freelists[l].aux) { + struct chunk *tmp = freelists[l].aux; + freelists[l].aux = 0; + flush_to_global_freelist(l, tmp, 0); + DB({ + struct backup *p; + for (p = backupaux[l]; p->next; p = p->next) + ; + p->next = spare; + spare = backupaux[l]; + backupaux[l] = 0; + }) + } + freelists[l].aux = freelists[l].free; + freelists[l].free = 0; + DB(backupaux[l] = backupfree[l]); + DB(backupfree[l] = 0); + freelists[l].count = 0; + } + old->next = 
freelists[l].free; + freelists[l].count++; + freelists[l].free = old; + DB({ + struct backup *p = balloc(); + p->next = backupfree[l]; + p->item = old; + backupfree[l] = p; + }) + DB(tbackup(l)); + } else { + struct freepage *fpage; + assert(l == BIG); + assert(((U)old & (PAGESIZE - 1)) == 0); + LOCK(NUMSMALL); + fpage = free_pages(old); + if ((void *)((U)fpage + fpage->size * PAGESIZE) == membase->end && + sbrk(0) == membase->end) { + fp_remove(fpage); + sbrk((U)fpage - (U)membase->end); + membase->end = fpage; + } + UNLOCK(NUMSMALL); + } +} + +int mresize(void *old, U size) { + unsigned t, i; + struct page *op, *tpg; + int nl; + void *pp; + + if (!old) + return 0; +#ifdef MALLOC_DEBUG + if ((U)old < (U)membase || (U)old >= (U)membase->end) { + printf("***Invalid pointer given to " S(mresize) " %p\n", old); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(old = gcheck(old); size += 2 * sizeof(U);) + op = ADDR2PAGE(old); + nl = SMLIST(size); + if (op->code == nl) + return 1; + if (op->code == BIG && nl == -1) { + size = (size + PAGESIZE - 1) / PAGESIZE; + if ((int)size > op->page - PAGENUM(old)) { + LOCK(NUMSMALL); + if (PAGEADDR(op->page) == membase->end || + (tpg = NUM2PAGE(op->page))->code != BIG + FREE || + tpg->page - PAGENUM(old) < (int)size) { + UNLOCK(NUMSMALL); + return 0; + } + fp_remove(FREEPAGE(op->page)); + tpg->code = MIDDLE; + for (i = op->page, op->page = t = tpg->page; i < t; i++) + NUM2PAGE(i)->page = PAGENUM(old); + UNLOCK(NUMSMALL); + } + if ((int)size < op->page - PAGENUM(old)) { + LOCK(NUMSMALL); + t = PAGENUM(old) + size; + tpg = NUM2PAGE(t); + tpg->code = BIG; + tpg->page = op->page; + for (i = op->page - 1; i > t; i--) + NUM2PAGE(i)->page = t; + op->page = t; + UNLOCK(NUMSMALL); + pp = PAGEADDR(t); + DB(pp = gsetup(pp);) + free(pp); + } + assert((int)size == op->page - PAGENUM(old)); + return 1; + } + return 0; +} + +U msize(void *p) { + struct page *pg; + U size; + +#ifdef MALLOC_DEBUG + if ((U)p < (U)membase || (U)p >= (U)membase->end) { + 
printf("***Invalid pointer given to " S(msize) " %p\n", p); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(p = gcheck(p);) + pg = ADDR2PAGE(p); + if (pg->code < NUMSMALL) { + assert(((U)p & (SIZE(pg->code) - 1)) == 0); + size = SIZE(pg->code); + } else { + assert((pg->code & ~FREE) == BIG); + assert(((U)p & (PAGESIZE - 1)) == 0); + size = (pg->page - PAGENUM(p)) * PAGESIZE; + } + DB(size -= 2 * sizeof(U)); + return size; +} + +void *realloc(void *old, size_t size) { + if (size == 0) { + free(old); + return 0; + } else if (!old) { + return malloc(size); + } else if (mresize(old, size)) { + return old; + } else { + U osize = msize(old); + void *new = malloc(size); + + if (size > osize) + size = osize; + if (new) { + memcpy(new, old, size); + free(old); + } + return new; + } +} + +void *calloc(size_t n1, size_t n2) { + U size = n1 * n2; + void *new = malloc(size); + + if (new) + memset(new, 0, size); + return new; +} + +void heapdump() { + void *p; + struct page *pg; + struct chunk *cp; + unsigned i, j, cnt, lbig = 0; + char buffer[64]; + struct freepage *p1, *p2; + + cnt = ((U)membase->end - (U)membase) / PAGESIZE; + printf("membase = %p, end = %p, %u pages total", membase, membase->end, cnt); + for (i = 0, p = membase; p < membase->end; p += PAGESIZE, i++) { + if (i % 8 == 0) + printf("\n0x%08lx: ", (U)p); + pg = NUM2PAGE(i); + if (pg->code < NUMSMALL) { + sprintf(buffer, "%d(%d)", SIZE(pg->code), pg->count); + printf("%8s", buffer); + lbig = 0; + } else + switch (pg->code & ~FREE) { + case BIG: + printf(" %c %-5d", pg->code & FREE ? 'F' : 'B', pg->page - i); + lbig = i; + break; + case MIDDLE: + if (pg->page == lbig) + printf(" <--> "); + else + printf(" ?--? "); + break; + case BASE: + printf(pg->code & FREE ? " GAP " : " BASE "); + lbig = 0; + break; + default: + printf(" ???? 
"); + lbig = 0; + break; + } + } + for (i = 0; i < NUMSMALL; i++) { + printf("\n\nSIZE %4d: local %d", SIZE(i), freelists[i].count); + if (freelists[i].free) { + printf("[%p", freelists[i].free); + for (cp = freelists[i].free->next; cp; cp = cp->next) + printf(", %p", cp); + printf("]"); + } + if (freelists[i].aux) { + printf(" + [%p", freelists[i].aux); + for (cp = freelists[i].aux->next; cp; cp = cp->next) + printf(", %p", cp); + printf("]"); + } + printf("\n\t global "); + for (j = membase->freechunks[i]; j; j = pg->page) { + pg = NUM2PAGE(j); + if (pg->code != i) + printf("", pg->code); + printf("%d[", pg->count); + if (pg->count) + for (cp = PAGEADDR(j) + pg->free * SIZE(i); cp; cp = cp->next) + printf("%p%s", cp, cp->next ? ", " : "]"); + if (pg->page) + printf(" + "); + } + } + printf("\n\nBIG: "); + for (p1 = membase->freepages; p1; p1 = p1->bigger) { + printf("\t%d[%p", p1->size, p1); + for (p2 = p1->next; p2; p2 = p2->next) + printf(", %p", p2); + printf("]\n"); + } + printf("\n\n\n"); +} diff --git a/src/shm_malloc/malloc.h b/src/shm_malloc/malloc.h new file mode 100644 index 0000000..9f564d3 --- /dev/null +++ b/src/shm_malloc/malloc.h @@ -0,0 +1,171 @@ +#ifndef _malloc_h_ +#define _malloc_h_ +#include + +#if defined(LOCKTYPE) && LOCKTYPE == SYSVSEM +#include +#include +#endif /* SYSVSEM */ +#if defined(LOCKTYPE) && LOCKTYPE == SPINLOCK +#include "atomic.h" +#endif /* SPINLOCK */ +#if defined(LOCKTYPE) && LOCKTYPE == PMUTEX +#include +#endif /* PMUTEX */ + +/* PAGESIZE MUST be a constant and MUST be a power of 2. It may be larger +** than the actual machine page size, but probably can't be smaller +** Total heap memory is limited to PAGESIZE * 2^32, and must in fact be in +** one contiguous extent that size or smaller (so if brk/sbrk gives you holes +** you'll get less memory.) 
PAGESIZE must be <= sizeof(void *) * 2^11, (8K +** on a 32-bit machine, 16K on 64-bit) though the bitfields could be rearranged +** to allow up to sizeof(void *) * 2^12 fairly easily. +*/ +#if defined(__alpha__) +#define PAGESIZE 8192 +#else +#define PAGESIZE 4096 +#endif + +enum { + size4, + size8 +#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ > 4 + = 0 +#endif + , + size16 +#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ > 8 + = 0 +#endif + , + size32, + size64, + size128, + size256, + size512, + size1024, + size2048, + size4096, + size8192, + size16384, + size32768, + size65536, + BIG = 26, + MIDDLE = 28, + BASE = 30, + FREE = 1 +}; +#define CAT(A, B) A##B +#define XCAT(A, B) CAT(A, B) + +/* number of small (<1 page) block sizes supported */ +#define NUMSMALL XCAT(size, PAGESIZE) +#define LOG2PAGESIZE (NUMSMALL + (8 - size256)) +#define SIZE(l) ((1 << (8 - size256)) << (l)) +#define PERPAGE(l) ((PAGESIZE / (1 << (8 - size256))) >> (l)) + +struct page { /* descriptor for a page */ + unsigned page; /* page number of next page with same chunksize + * for small chunk pages. + * page number after end of extent for BIG + * page number of start of extent for MIDDLE */ + unsigned free : 11; /* offset of first free chunk on page. + * 0 for non small chunk pages */ + int count : 12; /* number of free chunks on page. + * 0 for non small chunk pages */ + unsigned code : 8; /* code describing size of objects on page + * 1024 && (sz) <= 1024 ? size1024 \ + : PAGESIZE > 2048 && (sz) <= 2048 ? size2048 \ + : PAGESIZE > 4096 && (sz) <= 4096 ? size4096 \ + : PAGESIZE > 8192 && (sz) <= 8192 ? size8192 \ + : PAGESIZE > 16384 && (sz) <= 16384 ? size16384 \ + : PAGESIZE > 32768 && (sz) <= 32768 ? size32768 \ + : -1) +/* use the MALLOC macro with a CONSTANT argument for fast mallocs, less than +** half a page. Will crash if greater than half a page. 
No speed advantage +** if the argument is not constant */ +#ifndef MALLOC_DEBUG +#define MALLOC(sz) malloc_small(SMLIST(sz)) +#else +#define MALLOC(sz) (assert((sz) <= PAGESIZE / 2), malloc(sz)) +#endif + +#endif /* _malloc_h_ */ diff --git a/src/shm_malloc/shm_malloc.h b/src/shm_malloc/shm_malloc.h new file mode 100644 index 0000000..3b5e737 --- /dev/null +++ b/src/shm_malloc/shm_malloc.h @@ -0,0 +1,28 @@ +#ifndef _shm_malloc_h_ +#define _shm_malloc_h_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int shm_init(const char *, void (*)()), shm_fini(), shm_destroy(), + shm_child(); + +extern void *shm_malloc(size_t), *shm_calloc(size_t, size_t), + *shm_realloc(void *, size_t), *shm_valloc(size_t); +extern void shm_free(void *); +extern int shm_mresize(void *, size_t); +extern size_t shm_msize(void *); + +extern void *shm_global(), shm_set_global(void *); + +#define FIRST_TWO_ARGS(a, b, ...) a, b +#define shm_init(...) shm_init(FIRST_TWO_ARGS(__VA_ARGS__, 0)) + +#ifdef __cplusplus +} +#endif + +#endif /* _shm_malloc_h_ */ diff --git a/src/shm_malloc/tshm1.c b/src/shm_malloc/tshm1.c new file mode 100644 index 0000000..8b1b23b --- /dev/null +++ b/src/shm_malloc/tshm1.c @@ -0,0 +1,63 @@ +#include "shm_malloc.h" +#include +#include +#include + +#ifndef SHM_FILE +#define SHM_FILE "tshm1_file" +#endif + +struct list { + struct list *next; + char *data; +}; + +struct head { + struct list *head; + struct list **tail; +}; + +void setup() { + struct head *h = shm_malloc(sizeof(struct head)); + if (!h) { + perror("shm_malloc"); + exit(1); + } + h->head = 0; + h->tail = &h->head; + shm_set_global(h); +} + +int main(int ac, char **av) { + int i; + struct head *h; + struct list *l; + + if (shm_init(SHM_FILE, setup) < 0) { + perror("shm_init"); + exit(1); + } + h = shm_global(); + if (!h) { + perror("shm_global"); + exit(1); + } + for (i = 1; i < ac; i++) { + if (!(l = shm_malloc(sizeof(struct list)))) { + perror("shm_malloc"); + exit(1); + } + l->next = 0; + if 
(!(l->data = shm_malloc(strlen(av[i]) + 1))) { + perror("shm_malloc"); + exit(1); + } + strcpy(l->data, av[i]); + *h->tail = l; + h->tail = &l->next; + } + for (l = h->head; l; l = l->next) { + printf("%s\n", l->data); + } + return 0; +} diff --git a/src/shm_malloc/tshm2.c b/src/shm_malloc/tshm2.c new file mode 100644 index 0000000..a8594b0 --- /dev/null +++ b/src/shm_malloc/tshm2.c @@ -0,0 +1,128 @@ +#include "shm_malloc.h" +#include +#include +#include + +struct list { + struct list *next; +}; + +#define MAX_PATTERNS 100 +struct pattern { + long size, count; +} test[MAX_PATTERNS]; +int patterns = 0; + +long getval(char *s, char **p) { + long rv = strtol(s, p, 0); + char ch = **p; + if (ch == 'k' || ch == 'K') { + rv *= 1024; + ++*p; + } else if (ch == 'm' || ch == 'M') { + rv *= 1024 * 1024; + ++*p; + } else if (ch == 'g' || ch == 'G') { + rv *= 1024 * 1024 * 1024; + ++*p; + } + return rv; +} + +int main(int ac, char **av) { + int i, j; + struct list *head = 0, **tail = &head, *l, *next; + const char *file = 0; + long total = 0, maxcnt = 0, count; + int suffix; + + for (i = 1; i < ac; i++) { + if (isdigit(*av[i]) && patterns < MAX_PATTERNS) { + char *p; + long v = getval(av[i], &p); + if (*p == 'x' || *p == 'X') { + test[patterns].count = v; + test[patterns].size = getval(p + 1, &p); + } else { + test[patterns].count = 1; + test[patterns].size = v; + } + if (*p) { + fprintf(stderr, "ignoring bad pattern: %s\n", av[i]); + } else { + if (test[patterns].count > maxcnt) + maxcnt = test[patterns].count; + total += test[patterns].count * test[patterns].size; + patterns++; + } + } else if (!file) { + file = av[i]; + } else { + patterns = 0; + break; + } + } + if (patterns == 0) { + fprintf(stderr, "usage: %s [file] pattern...\n", av[0]); + exit(1); + } + if (shm_init(file) < 0) { + perror("shm_init"); + exit(1); + } + suffix = 0; + while (total > 2000 && suffix < 4) { + suffix++; + total = (total + 512) / 1024; + } + printf("Attempting %d patterns for %ld%c total\n", 
patterns, total, + " KMGT"[suffix]); + total = 0; + for (i = 0; i < maxcnt; i++) + for (j = 0; j < patterns; j++) + if (i < test[j].count) { + *tail = shm_malloc(test[j].size); + if (*tail) { + total += test[j].size; + tail = &(*tail)->next; + } else { + suffix = 0; + while (total > 2000 && suffix < 4) { + suffix++; + total = (total + 512) / 1024; + } + printf("Alloc failed after %ld%c\n", total, " KMGT"[suffix]); + i = maxcnt; + break; + } + } + *tail = 0; + printf("Done with allocation, now freeing\n"); + count = 0; + for (l = head; l; l = next) { + next = l->next; + if (next) { + l->next = next->next; + shm_free(next); + if (++count == 1000) { + putchar('.'); + fflush(stdout); + count = 0; + } + next = l->next; + } + } + printf("\nFreed half, now freeing remainder\n"); + count = 0; + for (l = head; l; l = next) { + next = l->next; + shm_free(l); + if (++count == 1000) { + putchar('.'); + fflush(stdout); + count = 0; + } + } + printf("\n"); + return 0; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..ae6441d --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,10 @@ +add_executable(test test.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(test shm pthread rt) + +add_executable(test_client test.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_compile_definitions(test_client PUBLIC USE_CLIENT) +target_link_libraries(test_client shm pthread rt) + +add_executable(test_spatter test_spatter.c parse-args.c json.c sp_alloc.c pcg_basic.c backend-support-tests.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_compile_definitions(test_spatter PUBLIC USE_SERIAL) +target_link_libraries(test_spatter shm pthread rt argtable3) diff --git a/tests/backend-support-tests.c b/tests/backend-support-tests.c new file mode 100644 index 0000000..c4bf540 --- /dev/null +++ b/tests/backend-support-tests.c @@ -0,0 +1,75 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. 
+This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +*/ + +#include "backend-support-tests.h" + +int sg_cuda_support() { +#if defined USE_CUDA + return 1; +#else + return 0; +#endif +} + +int sg_opencl_support() { +#if defined USE_OPENCL + return 1; +#else + return 0; +#endif +} + +int sg_openmp_support() { +#if defined USE_OPENMP + return 1; +#else + return 0; +#endif +} + +int sg_serial_support() { +#if defined USE_SERIAL + return 1; +#else + return 0; +#endif +} diff --git a/tests/json.c b/tests/json.c new file mode 100644 index 0000000..679120d --- /dev/null +++ b/tests/json.c @@ -0,0 +1,940 @@ +/* vim: set et ts=3 sw=3 sts=3 ft=c: + * + * Copyright (C) 2012, 2013, 2014 James McLaughlin et al. All rights reserved. + * https://github.com/udp/json-parser + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "json.h" + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif + +const struct _json_value json_value_none; + +#include +#include +#include +#include + +typedef unsigned int json_uchar; + +static unsigned char hex_value(json_char c) { + if (isdigit(c)) + return c - '0'; + + switch (c) { + case 'a': + case 'A': + return 0x0A; + case 'b': + case 'B': + return 0x0B; + case 'c': + case 'C': + return 0x0C; + case 'd': + case 'D': + return 0x0D; + case 'e': + case 'E': + return 0x0E; + case 'f': + case 'F': + return 0x0F; + default: + return 0xFF; + } +} + +typedef struct { + unsigned long used_memory; + + unsigned int uint_max; + unsigned long ulong_max; + + json_settings settings; + int first_pass; + + const json_char *ptr; + unsigned int cur_line, cur_col; + +} json_state; + +static void *default_alloc(size_t size, int zero, void *user_data) { + return zero ? 
calloc(1, size) : malloc(size); +} + +static void default_free(void *ptr, void *user_data) { free(ptr); } + +static void *json_alloc(json_state *state, unsigned long size, int zero) { + if ((state->ulong_max - state->used_memory) < size) + return 0; + + if (state->settings.max_memory && + (state->used_memory += size) > state->settings.max_memory) { + return 0; + } + + return state->settings.mem_alloc(size, zero, state->settings.user_data); +} + +static int new_value(json_state *state, json_value **top, json_value **root, + json_value **alloc, json_type type) { + json_value *value; + int values_size; + + if (!state->first_pass) { + value = *top = *alloc; + *alloc = (*alloc)->_reserved.next_alloc; + + if (!*root) + *root = value; + + switch (value->type) { + case json_array: + + if (value->u.array.length == 0) + break; + + if (!(value->u.array.values = (json_value **)json_alloc( + state, value->u.array.length * sizeof(json_value *), 0))) { + return 0; + } + + value->u.array.length = 0; + break; + + case json_object: + + if (value->u.object.length == 0) + break; + + values_size = sizeof(*value->u.object.values) * value->u.object.length; + + if (!(value->u.object.values = (json_object_entry *)json_alloc( + state, values_size + ((unsigned long)value->u.object.values), + 0))) { + return 0; + } + + value->_reserved.object_mem = + (*(char **)&value->u.object.values) + values_size; + + value->u.object.length = 0; + break; + + case json_string: + + if (!(value->u.string.ptr = (json_char *)json_alloc( + state, (value->u.string.length + 1) * sizeof(json_char), 0))) { + return 0; + } + + value->u.string.length = 0; + break; + + default: + break; + }; + + return 1; + } + + if (!(value = (json_value *)json_alloc( + state, sizeof(json_value) + state->settings.value_extra, 1))) { + return 0; + } + + if (!*root) + *root = value; + + value->type = type; + value->parent = *top; + +#ifdef JSON_TRACK_SOURCE + value->line = state->cur_line; + value->col = state->cur_col; +#endif + + if 
(*alloc) + (*alloc)->_reserved.next_alloc = value; + + *alloc = *top = value; + + return 1; +} + +#define whitespace \ + case '\n': \ + ++state.cur_line; \ + state.cur_col = 0; \ + case ' ': \ + case '\t': \ + case '\r' + +#define string_add(b) \ + do { \ + if (!state.first_pass) \ + string[string_length] = b; \ + ++string_length; \ + } while (0); + +#define line_and_col state.cur_line, state.cur_col + +static const long flag_next = 1 << 0, flag_reproc = 1 << 1, + flag_need_comma = 1 << 2, flag_seek_value = 1 << 3, + flag_escaped = 1 << 4, flag_string = 1 << 5, + flag_need_colon = 1 << 6, flag_done = 1 << 7, + flag_num_negative = 1 << 8, flag_num_zero = 1 << 9, + flag_num_e = 1 << 10, flag_num_e_got_sign = 1 << 11, + flag_num_e_negative = 1 << 12, flag_line_comment = 1 << 13, + flag_block_comment = 1 << 14; + +json_value *json_parse_ex(json_settings *settings, const json_char *json, + size_t length, char *error_buf) { + json_char error[json_error_max]; + const json_char *end; + json_value *top, *root, *alloc = 0; + json_state state = {0}; + long flags; + long num_digits = 0, num_e = 0; + json_int_t num_fraction = 0; + + /* Skip UTF-8 BOM + */ + if (length >= 3 && ((unsigned char)json[0]) == 0xEF && + ((unsigned char)json[1]) == 0xBB && ((unsigned char)json[2]) == 0xBF) { + json += 3; + length -= 3; + } + + error[0] = '\0'; + end = (json + length); + + memcpy(&state.settings, settings, sizeof(json_settings)); + + if (!state.settings.mem_alloc) + state.settings.mem_alloc = default_alloc; + + if (!state.settings.mem_free) + state.settings.mem_free = default_free; + + memset(&state.uint_max, 0xFF, sizeof(state.uint_max)); + memset(&state.ulong_max, 0xFF, sizeof(state.ulong_max)); + + state.uint_max -= 8; /* limit of how much can be added before next check */ + state.ulong_max -= 8; + + for (state.first_pass = 1; state.first_pass >= 0; --state.first_pass) { + json_uchar uchar; + unsigned char uc_b1, uc_b2, uc_b3, uc_b4; + json_char *string = 0; + unsigned int 
string_length = 0; + + top = root = 0; + flags = flag_seek_value; + + state.cur_line = 1; + + for (state.ptr = json;; ++state.ptr) { + json_char b = (state.ptr == end ? 0 : *state.ptr); + + if (flags & flag_string) { + if (!b) { + sprintf(error, "Unexpected EOF in string (at %d:%d)", line_and_col); + goto e_failed; + } + + if (string_length > state.uint_max) + goto e_overflow; + + if (flags & flag_escaped) { + flags &= ~flag_escaped; + + switch (b) { + case 'b': + string_add('\b'); + break; + case 'f': + string_add('\f'); + break; + case 'n': + string_add('\n'); + break; + case 'r': + string_add('\r'); + break; + case 't': + string_add('\t'); + break; + case 'u': + + if (end - state.ptr <= 4 || + (uc_b1 = hex_value(*++state.ptr)) == 0xFF || + (uc_b2 = hex_value(*++state.ptr)) == 0xFF || + (uc_b3 = hex_value(*++state.ptr)) == 0xFF || + (uc_b4 = hex_value(*++state.ptr)) == 0xFF) { + sprintf(error, "Invalid character value `%c` (at %d:%d)", b, + line_and_col); + goto e_failed; + } + + uc_b1 = (uc_b1 << 4) | uc_b2; + uc_b2 = (uc_b3 << 4) | uc_b4; + uchar = (uc_b1 << 8) | uc_b2; + + if ((uchar & 0xF800) == 0xD800) { + json_uchar uchar2; + + if (end - state.ptr <= 6 || (*++state.ptr) != '\\' || + (*++state.ptr) != 'u' || + (uc_b1 = hex_value(*++state.ptr)) == 0xFF || + (uc_b2 = hex_value(*++state.ptr)) == 0xFF || + (uc_b3 = hex_value(*++state.ptr)) == 0xFF || + (uc_b4 = hex_value(*++state.ptr)) == 0xFF) { + sprintf(error, "Invalid character value `%c` (at %d:%d)", b, + line_and_col); + goto e_failed; + } + + uc_b1 = (uc_b1 << 4) | uc_b2; + uc_b2 = (uc_b3 << 4) | uc_b4; + uchar2 = (uc_b1 << 8) | uc_b2; + + uchar = 0x010000 | ((uchar & 0x3FF) << 10) | (uchar2 & 0x3FF); + } + + if (sizeof(json_char) >= sizeof(json_uchar) || (uchar <= 0x7F)) { + string_add((json_char)uchar); + break; + } + + if (uchar <= 0x7FF) { + if (state.first_pass) + string_length += 2; + else { + string[string_length++] = 0xC0 | (uchar >> 6); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + 
break; + } + + if (uchar <= 0xFFFF) { + if (state.first_pass) + string_length += 3; + else { + string[string_length++] = 0xE0 | (uchar >> 12); + string[string_length++] = 0x80 | ((uchar >> 6) & 0x3F); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + break; + } + + if (state.first_pass) + string_length += 4; + else { + string[string_length++] = 0xF0 | (uchar >> 18); + string[string_length++] = 0x80 | ((uchar >> 12) & 0x3F); + string[string_length++] = 0x80 | ((uchar >> 6) & 0x3F); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + break; + + default: + string_add(b); + }; + + continue; + } + + if (b == '\\') { + flags |= flag_escaped; + continue; + } + + if (b == '"') { + if (!state.first_pass) + string[string_length] = 0; + + flags &= ~flag_string; + string = 0; + + switch (top->type) { + case json_string: + + top->u.string.length = string_length; + flags |= flag_next; + + break; + + case json_object: + + if (state.first_pass) + (*(json_char **)&top->u.object.values) += string_length + 1; + else { + top->u.object.values[top->u.object.length].name = + (json_char *)top->_reserved.object_mem; + + top->u.object.values[top->u.object.length].name_length = + string_length; + + (*(json_char **)&top->_reserved.object_mem) += string_length + 1; + } + + flags |= flag_seek_value | flag_need_colon; + continue; + + default: + break; + }; + } else { + string_add(b); + continue; + } + } + + if (state.settings.settings & json_enable_comments) { + if (flags & (flag_line_comment | flag_block_comment)) { + if (flags & flag_line_comment) { + if (b == '\r' || b == '\n' || !b) { + flags &= ~flag_line_comment; + --state.ptr; /* so null can be reproc'd */ + } + + continue; + } + + if (flags & flag_block_comment) { + if (!b) { + sprintf(error, "%d:%d: Unexpected EOF in block comment", + line_and_col); + goto e_failed; + } + + if (b == '*' && state.ptr < (end - 1) && state.ptr[1] == '/') { + flags &= ~flag_block_comment; + ++state.ptr; /* skip closing sequence */ + } + + 
continue; + } + } else if (b == '/') { + if (!(flags & (flag_seek_value | flag_done)) && + top->type != json_object) { + sprintf(error, "%d:%d: Comment not allowed here", line_and_col); + goto e_failed; + } + + if (++state.ptr == end) { + sprintf(error, "%d:%d: EOF unexpected", line_and_col); + goto e_failed; + } + + switch (b = *state.ptr) { + case '/': + flags |= flag_line_comment; + continue; + + case '*': + flags |= flag_block_comment; + continue; + + default: + sprintf(error, "%d:%d: Unexpected `%c` in comment opening sequence", + line_and_col, b); + goto e_failed; + }; + } + } + + if (flags & flag_done) { + if (!b) + break; + + switch (b) { + whitespace: + continue; + + default: + + sprintf(error, "%d:%d: Trailing garbage: `%c`", state.cur_line, + state.cur_col, b); + + goto e_failed; + }; + } + + if (flags & flag_seek_value) { + switch (b) { + whitespace: + continue; + + case ']': + + if (top && top->type == json_array) + flags = (flags & ~(flag_need_comma | flag_seek_value)) | flag_next; + else { + sprintf(error, "%d:%d: Unexpected ]", line_and_col); + goto e_failed; + } + + break; + + default: + + if (flags & flag_need_comma) { + if (b == ',') { + flags &= ~flag_need_comma; + continue; + } else { + sprintf(error, "%d:%d: Expected , before %c", state.cur_line, + state.cur_col, b); + + goto e_failed; + } + } + + if (flags & flag_need_colon) { + if (b == ':') { + flags &= ~flag_need_colon; + continue; + } else { + sprintf(error, "%d:%d: Expected : before %c", state.cur_line, + state.cur_col, b); + + goto e_failed; + } + } + + flags &= ~flag_seek_value; + + switch (b) { + case '{': + + if (!new_value(&state, &top, &root, &alloc, json_object)) + goto e_alloc_failure; + + continue; + + case '[': + + if (!new_value(&state, &top, &root, &alloc, json_array)) + goto e_alloc_failure; + + flags |= flag_seek_value; + continue; + + case '"': + + if (!new_value(&state, &top, &root, &alloc, json_string)) + goto e_alloc_failure; + + flags |= flag_string; + + string = 
top->u.string.ptr; + string_length = 0; + + continue; + + case 't': + + if ((end - state.ptr) < 3 || *(++state.ptr) != 'r' || + *(++state.ptr) != 'u' || *(++state.ptr) != 'e') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_boolean)) + goto e_alloc_failure; + + top->u.boolean = 1; + + flags |= flag_next; + break; + + case 'f': + + if ((end - state.ptr) < 4 || *(++state.ptr) != 'a' || + *(++state.ptr) != 'l' || *(++state.ptr) != 's' || + *(++state.ptr) != 'e') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_boolean)) + goto e_alloc_failure; + + flags |= flag_next; + break; + + case 'n': + + if ((end - state.ptr) < 3 || *(++state.ptr) != 'u' || + *(++state.ptr) != 'l' || *(++state.ptr) != 'l') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_null)) + goto e_alloc_failure; + + flags |= flag_next; + break; + + default: + + if (isdigit(b) || b == '-') { + if (!new_value(&state, &top, &root, &alloc, json_integer)) + goto e_alloc_failure; + + if (!state.first_pass) { + while (isdigit(b) || b == '+' || b == '-' || b == 'e' || + b == 'E' || b == '.') { + if ((++state.ptr) == end) { + b = 0; + break; + } + + b = *state.ptr; + } + + flags |= flag_next | flag_reproc; + break; + } + + flags &= ~(flag_num_negative | flag_num_e | flag_num_e_got_sign | + flag_num_e_negative | flag_num_zero); + + num_digits = 0; + num_fraction = 0; + num_e = 0; + + if (b != '-') { + flags |= flag_reproc; + break; + } + + flags |= flag_num_negative; + continue; + } else { + sprintf(error, "%d:%d: Unexpected %c when seeking value", + line_and_col, b); + goto e_failed; + } + }; + }; + } else { + switch (top->type) { + case json_object: + + switch (b) { + whitespace: + continue; + + case '"': + + if (flags & flag_need_comma) { + sprintf(error, "%d:%d: Expected , before \"", line_and_col); + goto e_failed; + } + + flags |= flag_string; + + string = (json_char *)top->_reserved.object_mem; + 
string_length = 0; + + break; + + case '}': + + flags = (flags & ~flag_need_comma) | flag_next; + break; + + case ',': + + if (flags & flag_need_comma) { + flags &= ~flag_need_comma; + break; + } + + default: + sprintf(error, "%d:%d: Unexpected `%c` in object", line_and_col, b); + goto e_failed; + }; + + break; + + case json_integer: + case json_double: + + if (isdigit(b)) { + ++num_digits; + + if (top->type == json_integer || flags & flag_num_e) { + if (!(flags & flag_num_e)) { + if (flags & flag_num_zero) { + sprintf(error, "%d:%d: Unexpected `0` before `%c`", + line_and_col, b); + goto e_failed; + } + + if (num_digits == 1 && b == '0') + flags |= flag_num_zero; + } else { + flags |= flag_num_e_got_sign; + num_e = (num_e * 10) + (b - '0'); + continue; + } + + top->u.integer = (top->u.integer * 10) + (b - '0'); + continue; + } + + num_fraction = (num_fraction * 10) + (b - '0'); + continue; + } + + if (b == '+' || b == '-') { + if ((flags & flag_num_e) && !(flags & flag_num_e_got_sign)) { + flags |= flag_num_e_got_sign; + + if (b == '-') + flags |= flag_num_e_negative; + + continue; + } + } else if (b == '.' 
&& top->type == json_integer) { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit before `.`", line_and_col); + goto e_failed; + } + + top->type = json_double; + top->u.dbl = (double)top->u.integer; + + num_digits = 0; + continue; + } + + if (!(flags & flag_num_e)) { + if (top->type == json_double) { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit after `.`", line_and_col); + goto e_failed; + } + + top->u.dbl += + ((double)num_fraction) / (pow(10.0, (double)num_digits)); + } + + if (b == 'e' || b == 'E') { + flags |= flag_num_e; + + if (top->type == json_integer) { + top->type = json_double; + top->u.dbl = (double)top->u.integer; + } + + num_digits = 0; + flags &= ~flag_num_zero; + + continue; + } + } else { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit after `e`", line_and_col); + goto e_failed; + } + + top->u.dbl *= pow( + 10.0, (double)(flags & flag_num_e_negative ? -num_e : num_e)); + } + + if (flags & flag_num_negative) { + if (top->type == json_integer) + top->u.integer = -top->u.integer; + else + top->u.dbl = -top->u.dbl; + } + + flags |= flag_next | flag_reproc; + break; + + default: + break; + }; + } + + if (flags & flag_reproc) { + flags &= ~flag_reproc; + --state.ptr; + } + + if (flags & flag_next) { + flags = (flags & ~flag_next) | flag_need_comma; + + if (!top->parent) { + /* root value done */ + + flags |= flag_done; + continue; + } + + if (top->parent->type == json_array) + flags |= flag_seek_value; + + if (!state.first_pass) { + json_value *parent = top->parent; + + switch (parent->type) { + case json_object: + + parent->u.object.values[parent->u.object.length].value = top; + + break; + + case json_array: + + parent->u.array.values[parent->u.array.length] = top; + + break; + + default: + break; + }; + } + + if ((++top->parent->u.array.length) > state.uint_max) + goto e_overflow; + + top = top->parent; + + continue; + } + } + + alloc = root; + } + + return root; + +e_unknown_value: + + sprintf(error, "%d:%d: 
Unknown value", line_and_col);
  goto e_failed;

e_alloc_failure:

  strcpy(error, "Memory allocation failure");
  goto e_failed;

e_overflow:

  sprintf(error, "%d:%d: Too long (caught overflow)", line_and_col);
  goto e_failed;

e_failed:

  /* Report the error to the caller's buffer when one was provided. */
  if (error_buf) {
    if (*error)
      strcpy(error_buf, error);
    else
      strcpy(error_buf, "Unknown error");
  }

  /* On the first (sizing) pass values are chained through
     _reserved.next_alloc, so walk and free that list directly. */
  if (state.first_pass)
    alloc = root;

  while (alloc) {
    top = alloc->_reserved.next_alloc;
    state.settings.mem_free(alloc, state.settings.user_data);
    alloc = top;
  }

  /* After the second pass the tree is fully linked; free it structurally. */
  if (!state.first_pass)
    json_value_free_ex(&state.settings, root);

  return 0;
}

/* Convenience wrapper: parse with default settings and no error buffer. */
json_value *json_parse(const json_char *json, size_t length) {
  json_settings settings = {0};
  return json_parse_ex(&settings, json, length, 0);
}

/* Free a parsed value tree without recursion: descend by popping the last
   child of each container (decrementing its length as the loop progresses),
   and climb back up via the parent pointers, freeing each node after its
   children are gone.  All releases go through settings->mem_free. */
void json_value_free_ex(json_settings *settings, json_value *value) {
  json_value *cur_value;

  if (!value)
    return;

  /* Detach from any parent so the upward walk terminates here. */
  value->parent = 0;

  while (value) {
    switch (value->type) {
    case json_array:

      /* No children left: free the (now empty) values array itself. */
      if (!value->u.array.length) {
        settings->mem_free(value->u.array.values, settings->user_data);
        break;
      }

      /* Descend into the last remaining child. */
      value = value->u.array.values[--value->u.array.length];
      continue;

    case json_object:

      if (!value->u.object.length) {
        settings->mem_free(value->u.object.values, settings->user_data);
        break;
      }

      value = value->u.object.values[--value->u.object.length].value;
      continue;

    case json_string:

      settings->mem_free(value->u.string.ptr, settings->user_data);
      break;

    default:
      break;
    };

    /* This node is fully drained: free it and climb to its parent. */
    cur_value = value;
    value = value->parent;
    settings->mem_free(cur_value, settings->user_data);
  }
}

/* Convenience wrapper around json_value_free_ex using the default
   deallocator. */
void json_value_free(json_value *value) {
  json_settings settings = {0};
  settings.mem_free = default_free;
  json_value_free_ex(&settings, value);
}
diff --git a/tests/parse-args.c b/tests/parse-args.c
new file mode 100644
index 0000000..539ef88
--- /dev/null
+++ b/tests/parse-args.c
@@ -0,0 +1,1447 @@
/*
© (or copyright) 2022. Triad National Security, LLC.
All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +*/ + +#include "parse-args.h" +#include "argtable3.h" +#include "backend-support-tests.h" +#include "json.h" +#include "pcg_basic.h" +#include "sp_alloc.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#include "../src/cuda/cuda-backend.h" +#endif + +#ifdef USE_OPENMP +#include +#endif + +#ifdef USE_PAPI +#include "papi_helper.h" +int papi_nevents; +char papi_event_names[PAPI_MAX_COUNTERS][STRING_SIZE]; +#endif + +#define INTERACTIVE "INTERACTIVE" + +char platform_string[STRING_SIZE]; +char device_string[STRING_SIZE]; +char kernel_file[STRING_SIZE]; +char kernel_name[STRING_SIZE]; +char jsonfilename[STRING_SIZE]; +char op_string[STRING_SIZE]; + +int cuda_dev = -1; +int validate_flag = 0; +int quiet_flag = 0; +int aggregate_flag = 1; +int compress_flag = 0; +int stride_kernel = -1; + +enum sg_backend backend = INVALID_BACKEND; + +// These should actually stay global +int verbose; +FILE *err_file; + +void safestrcopy(char *dest, const char *src); +void parse_p(char *, struct run_config *, int mode); +ssize_t setincludes(size_t key, size_t *set, size_t set_len); +void xkp_pattern(size_t *pat, size_t dim); +void parse_backend(int argc, char **argv); + +void **argtable; +unsigned int number_of_arguments = 35; +struct arg_lit *verb, *help, *interactive, *validate, *aggregate, *compress; +struct arg_str *backend_arg, *cl_platform, *cl_device, *pattern, + *pattern_gather, *pattern_scatter, *kernelName, *delta, *delta_gather, + *delta_scatter, *name, *papi, *op; +struct arg_int *count, *wrap, *runs, *omp_threads, *vector_len, + 
*local_work_size, *shared_memory, *morton, *hilbert, *roblock, *stride, + *random_arg, *no_print_header; +struct arg_file *kernelFile; +struct arg_end *end; + +void initialize_argtable() { + // Initialize the argtable on the stack just because it is easier and how the + // documentation handles it + void **malloc_argtable = + (void **)malloc(sizeof(void *) * number_of_arguments); + + // Arguments that do not take parameters + malloc_argtable[0] = help = arg_litn( + NULL, "help", 0, 1, "Displays info about commands and then exits."); + malloc_argtable[1] = verb = arg_litn( + NULL, "verbose", 0, 1, + "Print info about default arguments that you have not overridden."); + malloc_argtable[2] = no_print_header = arg_intn( + "q", "no-print-header", "", 0, 1, "Do not print header information."); + malloc_argtable[3] = interactive = + arg_litn("i", "interactive", 0, 1, + "Pick the platform and the device interactively."); + malloc_argtable[4] = validate = + arg_litn(NULL, "validate", 0, 1, + "Perform extra validation checks to ensure data validity"); + malloc_argtable[5] = aggregate = + arg_litn("a", "aggregate", 0, 1, + "Report a minimum time for all runs of a given configuration " + "for 2 or more runs. [Default 1] (Do not use with PAPI)"); + malloc_argtable[6] = compress = arg_litn("c", "compress", 0, 1, "TODO"); + // Benchmark Configuration + malloc_argtable[7] = pattern = arg_strn( + "p", "pattern", "", 0, 1, + "Specify either a built-in pattern (i.e. UNIFORM), a custom pattern " + "(i.e. 1,2,3,4), or a path to a json file with a run-configuration."); + malloc_argtable[8] = pattern_gather = + arg_strn("g", "pattern-gather", "", 0, 1, + "Valid wtih [kernel-name: GS]. Specify either a built-in " + "pattern (i.e. UNIFORM), a custom pattern (i.e. 1,2,3,4), or a " + "path to a json file with a run-configuration."); + malloc_argtable[9] = pattern_scatter = + arg_strn("h", "pattern-scatter", "", 0, 1, + "Valid with [kernel-name: GS]. 
Specify either a built-in " + "pattern (i.e. UNIFORM), a custom pattern (i.e. 1,2,3,4), or a " + "path to a json file with a run-configuration."); + malloc_argtable[10] = kernelName = + arg_strn("k", "kernel-name", "", 0, 1, + "Specify the kernel you want to run. [Default: Gather]"); + malloc_argtable[11] = op = arg_strn("o", "op", "", 0, 1, "TODO"); + malloc_argtable[12] = delta = + arg_strn("d", "delta", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[13] = delta_gather = + arg_strn("x", "delta gather", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[14] = delta_scatter = + arg_strn("y", "delta scatter", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[15] = count = arg_intn( + "l", "count", "", 0, 1, "Number of Gathers or Scatters to perform."); + malloc_argtable[16] = wrap = + arg_intn("w", "wrap", "", 0, 1, + "Number of independent slots in the small buffer (source buffer " + "if Scatter, Target buffer if Gather. [Default: 1]"); + malloc_argtable[17] = runs = arg_intn( + "R", "runs", "", 0, 1, + "Number of times to repeat execution of the kernel. [Default: 10]"); + malloc_argtable[18] = omp_threads = + arg_intn("t", "omp-threads", "", 0, 1, + "Number of OpenMP threads. 
[Default: OMP_MAX_THREADS]"); + malloc_argtable[19] = vector_len = + arg_intn("v", "vector-len", "", 0, 1, "TODO"); + malloc_argtable[20] = local_work_size = arg_intn( + "z", "local-work-size", "", 0, 1, + "Numer of Gathers or Scatters performed by each thread on a GPU."); + malloc_argtable[21] = shared_memory = + arg_intn("m", "shared-memory", "", 0, 1, + "Amount of dummy shared memory to allocate on GPUs (used for " + "occupancy control)."); + malloc_argtable[22] = name = + arg_strn("n", "name", "", 0, 1, + "Specify and name this configuration in the output."); + malloc_argtable[23] = random_arg = + arg_intn("s", "random", "", 0, 1, + "Sets the seed, or uses a random one if no seed is specified."); + malloc_argtable[24] = backend_arg = + arg_strn("b", "backend", "", 0, 1, + "Specify a backend: OpenCL, OpenMP, CUDA, or Serial."); + malloc_argtable[25] = cl_platform = arg_strn( + NULL, "cl-platform", "", 0, 1, + "Specify platform if using OpenCL (case-insensitive, fuzzy matching)."); + malloc_argtable[26] = cl_device = arg_strn( + NULL, "cl-device", "", 0, 1, + "Specify device if using OpenCL (case-insensitive, fuzzy matching)."); + malloc_argtable[27] = kernelFile = + arg_filen("f", "kernel-file", "", 0, 1, + "Specify the location of an OpenCL kernel file."); + // Other Configurations + malloc_argtable[28] = morton = arg_intn(NULL, "morton", "", 0, 1, "TODO"); + malloc_argtable[29] = hilbert = + arg_intn(NULL, "hilbert", "", 0, 1, "TODO"); + malloc_argtable[30] = roblock = + arg_intn(NULL, "roblock", "", 0, 1, "TODO"); + malloc_argtable[31] = stride = arg_intn(NULL, "stride", "", 0, 1, "TODO"); + malloc_argtable[32] = papi = arg_strn(NULL, "papi", "", 0, 1, "TODO"); + malloc_argtable[33] = end = arg_end(20); + + // Random has an option to provide an argument. Default its value to -1. 
+ random_arg->hdr.flag |= ARG_HASOPTVALUE; + random_arg->ival[0] = -1; + + // Set default values + kernelName->sval[0] = "Gather\0"; + delta->sval[0] = "8\0"; + delta_gather->sval[0] = "8\0"; + delta_scatter->sval[0] = "8\0"; + wrap->ival[0] = 1; + runs->ival[0] = 10; + + // Set the global argtable equal to the malloc argtable + argtable = malloc_argtable; +} + +void copy_str_ignore_leading_space(char *dest, const char *source) { + if (source[0] == ' ') + safestrcopy(dest, &source[1]); + else + safestrcopy(dest, source); +} + +int get_num_configs(json_value *value) { + if (value->type != json_array) { + error("get_num_configs was not passed an array", ERROR); + } + + return value->u.array.length; +} + +void parse_json_kernel(json_object_entry cur, char **argv, int i) { + if (!strcasecmp(cur.value->u.string.ptr, "SCATTER") || + !strcasecmp(cur.value->u.string.ptr, "GATHER") || + !strcasecmp(cur.value->u.string.ptr, "GS")) { + error("Ambiguous Kernel Type: Assuming kernel-name option.", WARN); + snprintf(argv[i + 1], STRING_SIZE, "--kernel-name=%s", + cur.value->u.string.ptr); + } else { + error("Ambigous Kernel Type: Assuming kernel-file option.", WARN); + snprintf(argv[i + 1], STRING_SIZE, "--kernel-file=%s", + cur.value->u.string.ptr); + } +} + +void parse_json_array(json_object_entry cur, char **argv, int i) { + int index = 0; + index += snprintf(argv[i + 1], STRING_SIZE, "--%s=", cur.name); + printf("argv[%d]: %s\n", i + 1, argv[i + 1]); + + for (int j = 0; j < cur.value->u.array.length; j++) { + if (cur.value->u.array.values[j]->type != json_integer) { + error("Encountered non-integer json type while parsing array", ERROR); + } + + char buffer[STRING_SIZE]; + int check = snprintf(buffer, STRING_SIZE, "%zd", + cur.value->u.array.values[j]->u.integer); + int added = snprintf(buffer, STRING_SIZE - index, "%zd", + cur.value->u.array.values[j]->u.integer); + + if (check == added) { + index += snprintf(&argv[i + 1][index], STRING_SIZE - index, "%zd", + 
cur.value->u.array.values[j]->u.integer); + + if (index >= STRING_SIZE - 1) { + break; + } else if (j != cur.value->u.array.length - 1 && + index < STRING_SIZE - 1) { + index += snprintf(&argv[i + 1][index], STRING_SIZE - index, ","); + } + + } else { + index--; + argv[i + 1][index] = '\0'; + break; + } + } +} + +struct run_config *parse_json_config(json_value *value) { + + struct run_config *rc = + (struct run_config *)calloc(1, sizeof(struct run_config)); + + if (!value) + error("parse_json_config passed NULL pointer", ERROR); + + if (value->type != json_object) + error("parse_json_config should only be passed json_objects", ERROR); + + int argc = value->u.object.length + 1; + char **argv = (char **)sp_malloc(sizeof(char *), argc * 2, ALIGN_CACHE); + + for (int i = 0; i < argc; i++) + argv[i] = (char *)sp_malloc(1, STRING_SIZE * 2, ALIGN_CACHE); + + for (int i = 0; i < argc - 1; i++) { + json_object_entry cur = value->u.object.values[i]; + + if (cur.value->type == json_string) { + if (!strcasecmp(cur.name, "kernel")) { + parse_json_kernel(cur, argv, i); + } else { + snprintf(argv[i + 1], STRING_SIZE, "--%s=%s", cur.name, + cur.value->u.string.ptr); + } + } else if (cur.value->type == json_integer) { + snprintf(argv[i + 1], STRING_SIZE, "--%s=%zd", cur.name, + cur.value->u.integer); + } else if (cur.value->type == json_array) { + parse_json_array(cur, argv, i); + } else { + error("Unexpected json type", ERROR); + } + } + + // yeah its hacky - parse_args ignores the first arg + safestrcopy(argv[0], argv[1]); + + int nerrors = arg_parse(argc, argv, argtable); + + if (nerrors > 0) { + arg_print_errors(stdout, end, "Spatter"); + printf("Error while parsing json file.\n"); + exit(0); + } + + rc = parse_runs(argc, argv); + + for (int i = 0; i < argc; i++) + free(argv[i]); + + free(argv); + + return rc; +} + +void parse_args(int argc, char **argv, int *nrc, struct run_config **rc) { + initialize_argtable(); + int nerrors = arg_parse(argc, argv, argtable); + + if 
(help->count > 0) { + printf("Usage:\n"); + arg_print_syntax(stdout, argtable, "\n"); + arg_print_glossary(stdout, argtable, " %-28s %s\n"); + exit(0); + } + + if (nerrors > 0) { + arg_print_errors(stdout, end, "Spatter"); + printf("Try './spatter --help' for more information.\n"); + exit(0); + } + + parse_backend(argc, argv); + + // Parse command-line arguments to in case of specified json file. + int json = 0; + + if (pattern->count > 0) { + if (strstr(pattern->sval[0], "FILE")) { + safestrcopy(jsonfilename, strchr(pattern->sval[0], '=') + 1); + printf("Reading patterns from %s.\n", jsonfilename); + json = 1; + } + } + + if (json) { + FILE *fp; + struct stat filestatus; + int file_size; + char *file_contents; + json_char *json; + json_value *value; + + if (stat(jsonfilename, &filestatus) != 0) + error("Json file not found", ERROR); + + file_size = filestatus.st_size; + file_contents = (char *)sp_malloc(file_size, 1 + 1, ALIGN_CACHE); + + fp = fopen(jsonfilename, "rt"); + if (!fp) + error("Unable to open Json file", ERROR); + + if (fread(file_contents, file_size, 1, fp) != 1) { + fclose(fp); + error("Unable to read content of Json file", ERROR); + } + fclose(fp); + + json = (json_char *)file_contents; + value = json_parse(json, file_size); + + if (!value) + error("Unable to parse Json file", ERROR); + + // This is the number of specified runs in the json file. 
+ *nrc = get_num_configs(value); + + *rc = (struct run_config *)sp_calloc(sizeof(struct run_config), *nrc, + ALIGN_CACHE); + + for (int i = 0; i < *nrc; i++) { + struct run_config *rctemp = parse_json_config(value->u.array.values[i]); + rc[0][i] = *rctemp; + free(rctemp); + } + + json_value_free(value); + free(file_contents); + } else { + *rc = (struct run_config *)sp_calloc(sizeof(struct run_config), 1, + ALIGN_CACHE); + rc[0][0] = *parse_runs(argc, argv); + *nrc = 1; + } + + free(argtable); + + return; +} + +struct run_config *parse_runs(int argc, char **argv) { + int pattern_found = 0; + int pattern_scatter_found = 0; + int pattern_gather_found = 0; + + struct run_config *rc = + (struct run_config *)calloc(1, sizeof(struct run_config)); + rc->delta = -1; + rc->delta_gather = -1; + rc->delta_scatter = -1; + rc->stride_kernel = -1; + rc->ro_block = 1; + rc->ro_order = NULL; +#ifdef USE_OPENMP + rc->omp_threads = omp_get_max_threads(); +#else + rc->omp_threads = 1; +#endif + rc->kernel = INVALID_KERNEL; + safestrcopy(rc->name, "NONE"); + + if (kernelName->count > 0) { + copy_str_ignore_leading_space(kernel_name, kernelName->sval[0]); + if (!strcasecmp("GS", kernel_name)) + rc->kernel = GS; + else if (!strcasecmp("SCATTER", kernel_name)) + rc->kernel = SCATTER; + else if (!strcasecmp("GATHER", kernel_name)) + rc->kernel = GATHER; + else { + char output[STRING_SIZE]; + sprintf(output, "Invalid kernel %s\n", kernel_name); + error(output, ERROR); + } + } + + if (op->count > 0) { + copy_str_ignore_leading_space(op_string, op->sval[0]); + if (!strcasecmp("COPY", op_string)) + rc->op = OP_COPY; + else if (!strcasecmp("ACCUM", op_string)) + rc->op = OP_ACCUM; + else + error("Unrecognzied op type", ERROR); + } + + if (random_arg->count > 0) { + // Parsing the seed parameter + // If no argument was passed, use the current time in seconds since the + // epoch as the random seed + if (random_arg->ival[0] == -1) + rc->random_seed = time(NULL); + else + // sscanf(optarg, "%zu", 
rc->random_seed); + rc->random_seed = random_arg->ival[0]; + } + + if (omp_threads->count > 0) + rc->omp_threads = omp_threads->ival[0]; + + if (vector_len->count > 0) { + rc->vector_len = vector_len->ival[0]; + if (rc->vector_len < 1) + error("Invalid vector len!", ERROR); + } + + if (runs->count > 0) + rc->nruns = runs->ival[0]; + + if (wrap->count > 0) + rc->wrap = wrap->ival[0]; + + if (count->count > 0) + rc->generic_len = count->ival[0]; + + if (local_work_size->count > 0) + rc->local_work_size = local_work_size->ival[0]; + + if (shared_memory->count > 0) + rc->shmem = shared_memory->ival[0]; + + if (name->count > 0) + copy_str_ignore_leading_space(rc->name, name->sval[0]); + + if (pattern->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern->sval[0]); + // char* filePtr = strstr(rc->generator, "FILE"); + // if (filePtr) + // safestrcopy(rc->generator, filePtr); + parse_p(rc->generator, rc, 0); + pattern_found = 1; + } + + if (pattern_gather->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern_gather->sval[0]); + parse_p(rc->generator, rc, 1); + pattern_gather_found = 1; + } + + if (pattern_scatter->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern_scatter->sval[0]); + parse_p(rc->generator, rc, 2); + pattern_scatter_found = 1; + } + + if (delta->count > 0) { + char delta_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_temp, delta->sval[0]); + char *delim = ","; + char *ptr = strtok(delta_temp, delim); + if (!ptr) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas; + spIdx_t *mydeltas_ps; + + mydeltas = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_ps = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read = 0; + if (sscanf(ptr, "%zu", &(mydeltas[read++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr = strtok(NULL, delim)) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(mydeltas[read++])) < 1) + 
error("Failed to parse pattern", ERROR); + } + + rc->deltas = mydeltas; + rc->deltas_ps = mydeltas_ps; + rc->deltas_len = read; + + // rotate + for (size_t i = 0; i < rc->deltas_len; i++) + rc->deltas_ps[i] = + rc->deltas[((i - 1) + rc->deltas_len) % rc->deltas_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_len; i++) + rc->deltas_ps[i] += rc->deltas_ps[i - 1]; + + // compute max + size_t m = rc->deltas_ps[0]; + for (size_t i = 1; i < rc->deltas_len; i++) { + if (rc->deltas_ps[i] > m) + m = rc->deltas_ps[i]; + } + rc->delta = m; + } + + if (delta_gather->count > 0) { + char delta_gather_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_gather_temp, delta_gather->sval[0]); + char *delim_gather = ","; + char *ptr_gather = strtok(delta_gather_temp, delim_gather); + if (!ptr_gather) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas_gather; + spIdx_t *mydeltas_gather_ps; + + mydeltas_gather = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_gather_ps = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read_gather = 0; + if (sscanf(ptr_gather, "%zu", &(mydeltas_gather[read_gather++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr_gather = strtok(NULL, delim_gather)) && + read_gather < MAX_PATTERN_LEN) { + if (sscanf(ptr_gather, "%zu", &(mydeltas_gather[read_gather++])) < 1) + error("Failed to parse pattern", ERROR); + } + + rc->deltas_gather = mydeltas_gather; + rc->deltas_gather_ps = mydeltas_gather_ps; + rc->deltas_gather_len = read_gather; + + // rotate + for (size_t i = 0; i < rc->deltas_gather_len; i++) + rc->deltas_gather_ps[i] = + rc->deltas_gather[((i - 1) + rc->deltas_gather_len) % + rc->deltas_gather_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_gather_len; i++) + rc->deltas_gather_ps[i] += rc->deltas_gather_ps[i - 1]; + + // compute max + size_t m = rc->deltas_gather_ps[0]; + for (size_t i = 1; i < rc->deltas_gather_len; 
i++) { + if (rc->deltas_gather_ps[i] > m) + m = rc->deltas_gather_ps[i]; + } + rc->delta_gather = m; + } + + if (delta_scatter->count > 0) { + char delta_scatter_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_scatter_temp, delta_scatter->sval[0]); + char *delim_scatter = ","; + char *ptr_scatter = strtok(delta_scatter_temp, delim_scatter); + if (!ptr_scatter) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas_scatter; + spIdx_t *mydeltas_scatter_ps; + + mydeltas_scatter = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_scatter_ps = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read_scatter = 0; + if (sscanf(ptr_scatter, "%zu", &(mydeltas_scatter[read_scatter++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr_scatter = strtok(NULL, delim_scatter)) && + read_scatter < MAX_PATTERN_LEN) { + if (sscanf(ptr_scatter, "%zu", &(mydeltas_scatter[read_scatter++])) < 1) + error("Failed to parse pattern", ERROR); + } + + rc->deltas_scatter = mydeltas_scatter; + rc->deltas_scatter_ps = mydeltas_scatter_ps; + rc->deltas_scatter_len = read_scatter; + + // rotate + for (size_t i = 0; i < rc->deltas_scatter_len; i++) + rc->deltas_scatter_ps[i] = + rc->deltas_scatter[((i - 1) + rc->deltas_scatter_len) % + rc->deltas_scatter_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_scatter_len; i++) + rc->deltas_scatter_ps[i] += rc->deltas_scatter_ps[i - 1]; + + // compute max + size_t m = rc->deltas_scatter_ps[0]; + for (size_t i = 1; i < rc->deltas_scatter_len; i++) { + if (rc->deltas_scatter_ps[i] > m) + m = rc->deltas_scatter_ps[i]; + } + rc->delta_scatter = m; + } + + if (morton->count > 0) + rc->ro_morton = morton->ival[0]; + + if (hilbert->count > 0) + rc->ro_hilbert = hilbert->ival[0]; + + if (roblock->count > 0) + rc->ro_block = roblock->ival[0]; + + if (stride->count > 0) + rc->stride_kernel = stride->ival[0]; + + // VALIDATE ARGUMENTS + if (rc->kernel != GS && 
!pattern_found) + error("Please specify a pattern", ERROR); + + if ((rc->kernel == GS && !pattern_scatter_found) || + (rc->kernel == GS && !pattern_gather_found)) + error("Please specify a gather pattern and a scatter pattern for an GS " + "kernel", + ERROR); + + if (rc->kernel == GS && (rc->pattern_gather_len != rc->pattern_scatter_len)) + error("Gather pattern and scatter pattern must have the same length", + ERROR); + + if (rc->vector_len == 0) { + error("Vector length not set. Default is 1", WARN); + rc->vector_len = 1; + } + + if (rc->wrap == 0) { + error("length of smallbuf not specified. Default is 1 (slot of size " + "pattern_len elements)", + WARN); + rc->wrap = 1; + } + + if (rc->nruns == 0) { + error("Number of runs not specified. Default is 10 ", WARN); + rc->nruns = 10; + } + + if (rc->generic_len == 0) { + error("Length not specified. Default is 1024 (gathers/scatters)", WARN); + rc->generic_len = 1024; + } + + if (rc->kernel == INVALID_KERNEL) { + error("Kernel unspecified, guess GATHER", WARN); + rc->kernel = GATHER; + safestrcopy(kernel_name, "gather"); + } + + if (rc->kernel == SCATTER) + sprintf(kernel_name, "%s%zu", "scatter", rc->vector_len); + else if (rc->kernel == GATHER) + sprintf(kernel_name, "%s%zu", "gather", rc->vector_len); + else if (rc->kernel == GS) + sprintf(kernel_name, "%s%zu", "sg", rc->vector_len); + + if (pattern_found) { + if (rc->delta <= -1) { + error("delta not specified, default is 8\n", WARN); + rc->delta = 8; + rc->deltas_len = 1; + } + } + + if (pattern_gather_found) { + if (rc->delta_gather <= -1) { + error("delta gather not specified, default is 8\n", WARN); + rc->delta_gather = 8; + rc->deltas_gather_len = 1; + } + } + + if (pattern_scatter_found) { + if (rc->delta_scatter <= -1) { + error("delta scatter not specified, default is 8\n", WARN); + rc->delta_scatter = 8; + rc->deltas_scatter_len = 1; + } + } + + if (rc->op != OP_COPY) + error("OP must be OP_COPY", WARN); + + if (!strcasecmp(rc->name, "NONE")) { + if 
(rc->type != CUSTOM) + safestrcopy(rc->name, rc->generator); + else + safestrcopy(rc->name, "CUSTOM"); + } + +#ifdef USE_OPENMP + int max_threads = omp_get_max_threads(); + if (rc->omp_threads > max_threads) { + error("Too many OpenMP threads requested, using the max instead", WARN); + rc->omp_threads = max_threads; + } + if (rc->omp_threads == 0) { + error("Number of OpenMP threads not specified, using the max", WARN); + rc->omp_threads = max_threads; + } +#else + if (rc->omp_threads > 1) + error("Compiled without OpenMP support but requsted more than 1 thread, " + "using 1 instead", + WARN); +#endif + +#if defined USE_CUDA || defined USE_OPENCL + if (rc->local_work_size == 0) { + error("Local_work_size not set. Default is 1", WARN); + rc->local_work_size = 1; + } +#endif + return rc; +} + +ssize_t power(int base, int exp) { + int i, result = 1; + for (i = 0; i < exp; i++) + result *= base; + return result; +} + +// Yes, there is no need for recursion here but I did this in python first. 
I'll +// update this later with a cleaner implementation +void static laplacian_branch(int depth, int order, int n, int **pos, + int *pos_len) { + *pos = (int *)realloc(*pos, ((*pos_len) + order) * sizeof(int)); + + for (int i = 0; i < order; i++) { + (*pos)[i + *pos_len] = (i + 1) * power(n, depth); + } + + *pos_len += order; + return; +} + +void static laplacian(int dim, int order, int n, struct run_config *rc, + int mode) { + spIdx_t **pattern; + spSize_t *pattern_len; + + if (mode == 0) { // Normal pattern + pattern = &rc->pattern; + pattern_len = &rc->pattern_len; + } else if (mode == 1) { // Gather pattern (GS Kernel) + pattern = &rc->pattern_gather; + pattern_len = &rc->pattern_gather_len; + } else if (mode == 2) { // Scatter pattern (GS Kernel) + pattern = &rc->pattern_scatter; + pattern_len = &rc->pattern_scatter_len; + } else { + printf("laplacian: invalid mode %d\n", mode); + exit(1); + } + + if (dim < 1) { + error("laplacian: dim must be positive", ERROR); + } + + int final_len = dim * order * 2 + 1; + if (final_len > MAX_PATTERN_LEN) { + error("laplacian: resulting pattern too long", ERROR); + } + + int pos_len = 0; + int *pos = NULL; + + for (int i = 0; i < dim; i++) { + laplacian_branch(i, order, n, &pos, &pos_len); + } + + *pattern_len = final_len; + + *pattern = sp_calloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + int max = pos[pos_len - 1]; + + for (int i = 0; i < *pattern_len; i++) { + (*pattern)[i] = 2; + } + + // populate rc->pattern + for (int i = 0; i < pos_len; i++) { + (*pattern)[i] = (-pos[pos_len - i - 1] + max); + } + + (*pattern)[pos_len] = max; + + for (int i = 0; i < pos_len; i++) { + (*pattern)[pos_len + 1 + i] = pos[i] + max; + } + + free(pos); + return; +} + +void parse_backend(int argc, char **argv) { + err_file = stderr; + + safestrcopy(platform_string, "NONE"); + safestrcopy(device_string, "NONE"); + safestrcopy(kernel_file, "NONE"); + safestrcopy(kernel_name, "NONE"); + + if (backend_arg->count > 0) { + if 
(!strcasecmp("OPENCL", backend_arg->sval[0])) + backend = OPENCL; + else if (!strcasecmp("OPENMP", backend_arg->sval[0])) + backend = OPENMP; + else if (!strcasecmp("CUDA", backend_arg->sval[0])) + backend = CUDA; + else if (!strcasecmp("SERIAL", backend_arg->sval[0])) + backend = SERIAL; + else + error("Unrecognized Backend", ERROR); + } + + if (cl_platform->count > 0) + copy_str_ignore_leading_space(platform_string, cl_platform->sval[0]); + + if (cl_device->count > 0) + copy_str_ignore_leading_space(device_string, cl_device->sval[0]); + + if (interactive->count > 0) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + + if (kernelFile->count > 0) + copy_str_ignore_leading_space(kernel_file, kernelFile->filename[0]); + + if (no_print_header->count > 0) + quiet_flag = no_print_header->ival[0]; + + if (validate->count > 0) + validate_flag++; + + if (aggregate->count > 0) + aggregate_flag = 1; + + if (compress->count > 0) + compress_flag = 1; + + if (papi->count > 0) { +#ifdef USE_PAPI + { + char *pch = strtok(papi->sval[0], ","); + while (pch != NULL) { + safestrcopy(papi_event_names[papi_nevents++], pch); + pch = strtok(NULL, ","); + if (papi_nevents == PAPI_MAX_COUNTERS) + break; + } + } +#endif + } + + /* Check argument coherency */ + if (backend == INVALID_BACKEND) { + if (sg_cuda_support()) { + backend = CUDA; + error("No backend specified, guessing CUDA", WARN); + } else if (sg_opencl_support()) { + backend = OPENCL; + error("No backend specified, guessing OpenCL", WARN); + } else if (sg_openmp_support()) { + backend = OPENMP; + error("No backend specified, guessing OpenMP", WARN); + } else if (sg_serial_support()) { + backend = SERIAL; + error("No backend specified, guessing Serial", WARN); + } else + error("No backends available! 
Please recompile spatter with at least one " + "backend.", + ERROR); + } + + // Check to see if they compiled with support for their requested backend + if (backend == OPENCL) { + if (!sg_opencl_support()) + error("You did not compile with support for OpenCL", ERROR); + } else if (backend == OPENMP) { + if (!sg_openmp_support()) + error("You did not compile with support for OpenMP", ERROR); + } else if (backend == CUDA) { + if (!sg_cuda_support()) + error("You did not compile with support for CUDA", ERROR); + } else if (backend == SERIAL) { + if (!sg_serial_support()) + error("You did not compile with support for serial execution", ERROR); + } + + if (backend == OPENCL) { + if (!strcasecmp(platform_string, "NONE")) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + if (!strcasecmp(device_string, "NONE")) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + } + +#ifdef USE_CUDA + if (backend == CUDA) { + int dev = find_device_cuda(device_string); + if (dev == -1) { + error("Specified CUDA device not found or no device specified. 
Using " + "device 0", + WARN); + dev = 0; + } + cuda_dev = dev; + cudaSetDevice(dev); + } +#endif + + if (!strcasecmp(kernel_file, "NONE") && backend == OPENCL) { + error("Kernel file unspecified, guessing kernels/kernels_vector.cl", WARN); + safestrcopy(kernel_file, "kernels/kernels_vector.cl"); + } + + return; +} + +void parse_p(char *optarg, struct run_config *rc, int mode) { + spIdx_t **pattern; + spSize_t *pattern_len; + ssize_t *delta; + size_t **deltas; + size_t *deltas_len; + + if (mode == 0) { // Normal pattern + pattern = &rc->pattern; + pattern_len = &rc->pattern_len; + delta = &rc->delta; + deltas = &rc->deltas_gather; + deltas_len = &rc->deltas_len; + } else if (mode == 1) { // Gather pattern (GS Kernel) + pattern = &rc->pattern_gather; + pattern_len = &rc->pattern_gather_len; + delta = &rc->delta_gather; + deltas = &rc->deltas_gather; + deltas_len = &rc->deltas_gather_len; + } else if (mode == 2) { // Scatter pattern (GS Kernel) + pattern = &rc->pattern_scatter; + pattern_len = &rc->pattern_scatter_len; + delta = &rc->delta_scatter; + deltas = &rc->deltas_scatter; + deltas_len = &rc->deltas_scatter_len; + } else { + printf("parse_p: invalid mode %d\n", mode); + exit(1); + } + + rc->type = INVALID_IDX; + char *arg = 0; + if ((arg = strchr(optarg, ':'))) { + *arg = '\0'; + arg++; // arg now points to arguments to the pattern type + + // FILE mode indicates that we will load a + // config from a file + if (!strcmp(optarg, "FILE")) { + // TODO + // safestrcopy(idx_pattern_file, arg); + rc->type = CONFIG_FILE; + } + + // The Exxon Kernel Proxy-derived stencil + // It used to be called HYDRO so we will accept that too + // XKP:dim + else if (!strcmp(optarg, "XKP") || !strcmp(optarg, "HYDRO")) { + rc->type = XKP; + + size_t dim = 0; + char *dim_char = strtok(arg, ":"); + if (!dim_char) + error("XKP: size not found", 1); + if (sscanf(dim_char, "%zu", &dim) < 1) + error("XKP: Dimension not parsed", 1); + + *pattern_len = 73; + + *pattern = 
sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // The default delta is 1 + *delta = 1; + + if (!(*deltas)) { + *deltas = sp_malloc(sizeof(size_t), 1, ALIGN_CACHE); + } + *deltas[0] = *delta; + *deltas_len = 1; + + xkp_pattern(*pattern, dim); + } + + // Parse Uniform Stride Arguments, which are + // UNIFORM:index_length:stride + else if (!strcmp(optarg, "UNIFORM")) { + rc->type = UNIFORM; + + // Read the length + char *len = strtok(arg, ":"); + if (!len) + error("UNIFORM: Index Length not found", 1); + if (sscanf(len, "%zu", &(*pattern_len)) < 1) + error("UNIFORM: Length not parsed", 1); + + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // Read the stride + char *stride = strtok(NULL, ":"); + ssize_t strideval = 0; + if (!stride) + error("UNIFORM: Stride not found", 1); + if (sscanf(stride, "%zd", &strideval) < 1) + error("UNIFORM: Stride not parsed", 1); + + // Fill the pattern buffer + for (int i = 0; i < *pattern_len; i++) + (*pattern)[i] = i * strideval; + + char *delta2 = strtok(NULL, ":"); + if (delta2) { + if (!*deltas) { + *deltas = sp_malloc(sizeof(size_t), 1, ALIGN_CACHE); + } + *deltas_len = 1; + + if (!strcmp(delta2, "NR")) { + *delta = strideval * (*pattern_len); + (*deltas)[0] = *delta; + } else { + if (sscanf(delta2, "%zd", &(*delta)) < 1) + error("UNIFORM: delta not parsed", 1); + (*deltas)[0] = *delta; + } + } + + } + + // LAPLACIAN:DIM:ORDER:N + else if (!strcmp(optarg, "LAPLACIAN")) { + int dim_val, order_val, problem_size_val; + + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + rc->type = LAPLACIAN; + + // Read the dimension + char *dim = strtok(arg, ":"); + if (!dim) + error("LAPLACIAN: Dimension not found", 1); + if (sscanf(dim, "%d", &dim_val) < 1) + error("LAPLACIAN: Dimension not parsed", 1); + + // Read the order + char *order = strtok(NULL, ":"); + if (!order) + error("LAPLACIAN: Order not found", 1); + if (sscanf(order, "%d", &order_val) < 1) + error("LAPLACIAN: Order not parsed", 
1); + + // Read the problem size + char *problem_size = strtok(NULL, ":"); + if (!problem_size) + error("LAPLACIAN: Problem size not found", 1); + if (sscanf(problem_size, "%d", &problem_size_val) < 1) + error("LAPLACIAN: Problem size not parsed", 1); + + *delta = 1; + if (!(*deltas)) { + *deltas = sp_malloc(sizeof(spIdx_t), *delta, ALIGN_CACHE); + } + (*deltas)[0] = *delta; + *deltas_len = 1; + + laplacian(dim_val, order_val, problem_size_val, rc, mode); + } + + // Mostly Stride 1 Mode + // Arguments: index_length:list_of_breaks:list_of_deltas + // list_of_deltas should be length 1 or the same length as + // list_of_breaks. + // The elements of both lists should be nonnegative and + // the the elements of list_of_breaks should be strictly less + // than index_length + else if (!strcmp(optarg, "MS1")) { + rc->type = MS1; + + char *len = strtok(arg, ":"); + char *breaks = strtok(NULL, ":"); + char *gaps = strtok(NULL, ":"); + + size_t *ms1_breaks = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + size_t *ms1_deltas = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + size_t ms1_breaks_len = 0; + size_t ms1_deltas_len = 0; + + // Parse index length + sscanf(len, "%zu", &(*pattern_len)); + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // Parse breaks + char *ptr = strtok(breaks, ","); + size_t read = 0; + if (!ptr) + error("MS1: Breaks missing", 1); + if (sscanf(ptr, "%zu", &(ms1_breaks[read++])) < 1) + error("MS1: Failed to parse first break", 1); + + while ((ptr = strtok(NULL, ",")) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(ms1_breaks[read++])) < 1) + error("MS1: Failed to parse breaks", 1); + } + + ms1_breaks_len = read; + + if (!gaps) { + printf("1\n"); + error("error", ERROR); + } + + ptr = strtok(gaps, ","); + read = 0; + if (ptr) { + if (sscanf(ptr, "%zu", &(ms1_deltas[read++])) < 1) + error("Failed to parse first delta", 1); + + while ((ptr = strtok(NULL, ",")) && read < MAX_PATTERN_LEN) { + if 
(sscanf(ptr, "%zu", &(ms1_deltas[read++])) < 1) + error("Failed to parse deltas", 1); + } + } else + error("MS1: deltas missing", 1); + + ms1_deltas_len = read; + + (*pattern)[0] = -1; + size_t last = -1; + ssize_t j; + for (int i = 0; i < *pattern_len; i++) { + if ((j = setincludes(i, ms1_breaks, ms1_breaks_len)) != -1) + (*pattern)[i] = last + ms1_deltas[ms1_deltas_len > 1 ? j : 0]; + else + (*pattern)[i] = last + 1; + last = (*pattern)[i]; + } + + free(ms1_breaks); + free(ms1_deltas); + } else + error("Unrecognized mode in -p argument", 1); + } + + // CUSTOM mode means that the user supplied a single index buffer on the + // command line + else { + if (quiet_flag > 3) { + printf("Parse P Custom Pattern: %s\n", optarg); + } + rc->type = CUSTOM; + char *delim = ","; + char *ptr = strtok(optarg, delim); + size_t read = 0; + if (!ptr) + error("Pattern not found", 1); + + spIdx_t *mypat; + + mypat = sp_malloc(sizeof(spIdx_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + if (sscanf(ptr, "%zu", &(mypat[read++])) < 1) + error("Failed to parse first pattern element in custom mode", 1); + + while ((ptr = strtok(NULL, delim)) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(mypat[read++])) < 1) + error("Failed to parse pattern", 1); + } + *pattern = mypat; + *pattern_len = read; + } + + if (*pattern_len == 0) + error("Pattern length of 0", ERROR); + + if (rc->type == INVALID_IDX) + error("No pattern type set", ERROR); +} + +ssize_t setincludes(size_t key, size_t *set, size_t set_len) { + for (size_t i = 0; i < set_len; i++) { + if (set[i] == key) + return i; + } + return -1; +} + +void print_run_config(struct run_config rc) { + printf("Index: %zu ", rc.pattern_len); + printf("["); + for (size_t i = 0; i < rc.pattern_len; i++) { + printf("%zu", rc.pattern[i]); + if (i != rc.pattern_len - 1) + printf(" "); + } + printf("]\n"); + if (rc.deltas_len > 0) { + printf("Deltas: %zu ", rc.deltas_len); + printf("["); + for (size_t i = 0; i < rc.deltas_len; i++) { + printf("%zu", 
rc.deltas[i]); + if (i != rc.deltas_len - 1) + printf(" "); + } + printf("]\n"); + printf("Deltas_ps: %zu ", rc.deltas_len); + printf("["); + for (size_t i = 0; i < rc.deltas_len; i++) { + printf("%zu", rc.deltas_ps[i]); + if (i != rc.deltas_len - 1) + printf(" "); + } + printf("] (%zu)\n", rc.delta); + } else + printf("Delta: %zu\n", rc.delta); + + printf("kern: %s\n", kernel_name); + printf("genlen: %zu\n", rc.generic_len); +} + +void error(char *what, int code) { + if (code == ERROR) + fprintf(err_file, "Error: "); + else if (code == WARN) { + if (verbose) + fprintf(err_file, "Warning: "); + } + + if (verbose || code) { + fprintf(err_file, "%s", what); + fprintf(err_file, "\n"); + } + + if (code) + exit(code); +} + +void safestrcopy(char *dest, const char *src) { + dest[0] = '\0'; + strncat(dest, src, STRING_SIZE - 1); +} + +int compare_ssizet(const void *a, const void *b) { + if (*(ssize_t *)a > *(ssize_t *)b) + return 1; + else if (*(ssize_t *)a < *(ssize_t *)b) + return -1; + else + return 0; +} + +void copy4(ssize_t *dest, ssize_t *a, int *off) { + for (int i = 0; i < 4; i++) { + dest[i + *off] = a[i]; + } + *off += 4; +} + +void add4(ssize_t *dest, ssize_t *a, ssize_t *b, int *off) { + for (int i = 0; i < 4; i++) { + dest[i + *off] = a[i] + b[i]; + } + *off += 4; +} + +void xkp_pattern(size_t *pat_, size_t dim) { + ssize_t pat[73]; + for (int i = 0; i < 73; i++) { + pat[i] = i; + } + + ssize_t Xp[4]; + ssize_t Xn[4]; + ssize_t Yp[4]; + ssize_t Yn[4]; + ssize_t Zp[4]; + ssize_t Zn[4]; + + Xp[0] = 1; + Xp[1] = 2; + Xp[2] = 3; + Xp[3] = 4; + Xn[0] = -1; + Xn[1] = -2; + Xn[2] = -3; + Xn[3] = -4; + Yp[0] = dim; + Yp[1] = 2 * dim; + Yp[2] = 3 * dim; + Yp[3] = 4 * dim; + Yn[0] = -dim; + Yn[1] = -2 * dim; + Yn[2] = -3 * dim; + Yn[3] = -4 * dim; + Zp[0] = dim * dim; + Zp[1] = 2 * dim * dim; + Zp[2] = 3 * dim * dim; + Zp[3] = 4 * dim * dim; + Zn[0] = -dim * dim; + Zn[1] = -2 * dim * dim; + Zn[2] = -3 * dim * dim; + Zn[3] = -4 * dim * dim; + + int idx = 0; + 
pat[idx++] = 0; + copy4(pat, Xp, &idx); + copy4(pat, Xn, &idx); + copy4(pat, Yp, &idx); + copy4(pat, Yn, &idx); + copy4(pat, Zp, &idx); + copy4(pat, Zn, &idx); + + add4(pat, Xp, Yp, &idx); + add4(pat, Xp, Zp, &idx); + add4(pat, Xp, Yn, &idx); + add4(pat, Xp, Zn, &idx); + + add4(pat, Xn, Yp, &idx); + add4(pat, Xn, Zp, &idx); + add4(pat, Xn, Yn, &idx); + add4(pat, Xn, Zn, &idx); + + add4(pat, Yp, Zp, &idx); + add4(pat, Yp, Zn, &idx); + add4(pat, Yn, Zp, &idx); + add4(pat, Yn, Zn, &idx); + + qsort(pat, 73, sizeof(ssize_t), compare_ssizet); + + ssize_t min = pat[0]; + for (int i = 1; i < 73; i++) { + if (pat[i] < min) { + min = pat[i]; + } + } + + for (int i = 0; i < 73; i++) { + pat[i] -= min; + } + + for (int i = 0; i < 73; i++) { + pat_[i] = pat[i]; + } +} diff --git a/tests/pcg_basic.c b/tests/pcg_basic.c new file mode 100644 index 0000000..86e4ff3 --- /dev/null +++ b/tests/pcg_basic.c @@ -0,0 +1,106 @@ +/* + * PCG Random Number Generation for C. + * + * Copyright 2014 Melissa O'Neill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For additional information about the PCG random number generation scheme, + * including its license and other licensing options, visit + * + * http://www.pcg-random.org + */ + +/* + * This code is derived from the full C implementation, which is in turn + * derived from the canonical C++ PCG implementation. The C++ version + * has many additional features and is preferable if you can use C++ in + * your project. 
+ */ + +#include "pcg_basic.h" + +// state for global RNGs + +static pcg32_random_t pcg32_global = PCG32_INITIALIZER; + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. stream id) + +void pcg32_srandom_r(pcg32_random_t *rng, uint64_t initstate, + uint64_t initseq) { + rng->state = 0U; + rng->inc = (initseq << 1u) | 1u; + pcg32_random_r(rng); + rng->state += initstate; + pcg32_random_r(rng); +} + +void pcg32_srandom(uint64_t seed, uint64_t seq) { + pcg32_srandom_r(&pcg32_global, seed, seq); +} + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random_r(pcg32_random_t *rng) { + uint64_t oldstate = rng->state; + rng->state = oldstate * 6364136223846793005ULL + rng->inc; + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); +} + +uint32_t pcg32_random() { return pcg32_random_r(&pcg32_global); } + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand_r(pcg32_random_t *rng, uint32_t bound) { + // To avoid bias, we need to make the range of the RNG a multiple of + // bound, which we do by dropping output less than a threshold. + // A naive scheme to calculate the threshold would be to do + // + // uint32_t threshold = 0x100000000ull % bound; + // + // but 64-bit div/mod is slower than 32-bit div/mod (especially on + // 32-bit platforms). In essence, we do + // + // uint32_t threshold = (0x100000000ull-bound) % bound; + // + // because this version will calculate the same modulus, but the LHS + // value is less than 2^32. + + uint32_t threshold = -bound % bound; + + // Uniformity guarantees that this loop will terminate. 
In practice, it + // should usually terminate quickly; on average (assuming all bounds are + // equally likely), 82.25% of the time, we can expect it to require just + // one iteration. In the worst case, someone passes a bound of 2^31 + 1 + // (i.e., 2147483649), which invalidates almost 50% of the range. In + // practice, bounds are typically small and only a tiny amount of the range + // is eliminated. + for (;;) { + uint32_t r = pcg32_random_r(rng); + if (r >= threshold) + return r % bound; + } +} + +uint32_t pcg32_boundedrand(uint32_t bound) { + return pcg32_boundedrand_r(&pcg32_global, bound); +} diff --git a/tests/sp_alloc.c b/tests/sp_alloc.c new file mode 100644 index 0000000..1741e29 --- /dev/null +++ b/tests/sp_alloc.c @@ -0,0 +1,97 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +#include "sp_alloc.h" +#include "parse-args.h" //error +#include +#include //exit +#include //memset + +long long total_mem_used = 0; + +long long get_mem_used() { return total_mem_used; } +void check_size(size_t size) { + total_mem_used += size; + // printf("size: %zu\n", size); + if (total_mem_used > SP_MAX_ALLOC) { + error("Too much memory used.", ERROR); + } +} + +void check_safe_mult(size_t a, size_t b) { + int hi_bit_a = 0; + int hi_bit_b = 0; + + while (a >>= 1) + hi_bit_a++; + while (b >>= 1) + hi_bit_b++; + + if (hi_bit_a + hi_bit_b > sizeof(size_t) * 8) { + error("Error: Multiplication would overflow.", ERROR); + } +} + +void *sp_malloc(size_t size, size_t count, size_t align) { + check_safe_mult(size, count); + check_size(size * count); +#ifdef USE_POSIX_MEMALIGN + void *ptr = NULL; + int ret = posix_memalign(&ptr, align, size * count); + if (ret != 0) + ptr = NULL; +#else + void *ptr = aligned_alloc(align, size * count); +#endif + if (!ptr) { + printf("Attempted to allocate %zu bytes (%zu * %zu)\n", size * count, size, + count); + error("Error: failed to allocate memory", ERROR); + } + return ptr; +} + +void *sp_calloc(size_t size, size_t count, size_t align) { + void *ptr = sp_malloc(size, count, align); + memset(ptr, 0, size * count); + return ptr; +} diff --git a/tests/test.c b/tests/test.c new file mode 100644 index 0000000..30ceb13 --- /dev/null +++ b/tests/test.c @@ -0,0 +1,627 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "client.h" +#include "client_cleanup.h" +#include "client_init.h" +#include "client_memory.h" +#include "client_place_requests.h" +#include "config.h" +#include "kernels.h" +#include "shm_malloc.h" + +// functions for measuring execution time of read_data and write_data (adapted +// from https://stackoverflow.com/a/19898211) +struct timespec start_timer() { + struct timespec start_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + return start_time; +} + +// call this function to 
end a timer, returning nanoseconds elapsed as a long +uint64_t stop_timer(struct timespec start_time) { + struct timespec end_time; + clock_gettime(CLOCK_MONOTONIC, &end_time); + uint64_t diffInNanos = (end_time.tv_sec - start_time.tv_sec) * (uint64_t)1e9 + + (end_time.tv_nsec - start_time.tv_nsec); + return diffInNanos; +} + +#define TIME(cmd, elapsed_ns) \ + { \ + struct timespec start = start_timer(); \ + cmd; \ + elapsed_ns += stop_timer(start); \ + } + +#define SINGLE_ALLOC + +#ifdef SINGLE_ALLOC +double *res, *input, *buffer, *data; +size_t *ind1, *ind2, *tmp; +#endif + +#ifdef USE_CLIENT +struct client client; +int req_id = 0; +#else +// use regular malloc and free if we're not using the client model +#define shm_malloc malloc +#define shm_free free +#endif + +// mock API to interact with memory accelerator +double *read_data(const double *buffer, size_t N, const size_t *ind1, + const size_t *ind2, uint64_t *elapsed_ns, size_t num_threads, + bool use_avx) { +// Only time loops, not memory allocation or flow control +#ifndef SINGLE_ALLOC + double *res = (double *)shm_malloc(N * sizeof(double)); +#endif + + memset(res, 0, N * sizeof(double)); + +#ifdef USE_CLIENT + + struct request read_req; + + TIME( + { + scoria_read(&client, buffer, N, res, ind1, ind2, num_threads, use_avx, + &read_req); + wait_request(&client, &read_req); + }, + *elapsed_ns) + +#else + + if (ind1 == NULL) { + assert(ind2 == NULL); + if (num_threads == 0) { + TIME(read_single_thread_0(res, buffer, N, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_0(res, buffer, N, num_threads, use_avx), + *elapsed_ns) + } + return res; + } + + if (ind2 == NULL) { + assert(ind1 != NULL); + if (num_threads == 0) { + TIME(read_single_thread_1(res, buffer, N, ind1, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_1(res, buffer, N, ind1, num_threads, use_avx), + *elapsed_ns) + } + return res; + } + + assert(ind1 != NULL); + assert(ind2 != NULL); + + if (num_threads == 0) { + 
TIME(read_single_thread_2(res, buffer, N, ind1, ind2, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_2(res, buffer, N, ind1, ind2, num_threads, use_avx), + *elapsed_ns) + } + +#endif + + return res; +} + +void write_data(double *buffer, size_t N, const double *input, + const size_t *ind1, const size_t *ind2, uint64_t *elapsed_ns, + size_t num_threads, bool use_avx) { +#ifdef USE_CLIENT + + struct request write_req; + + TIME( + { + scoria_write(&client, buffer, N, input, ind1, ind2, num_threads, + use_avx, &write_req); + wait_request(&client, &write_req); + }, + *elapsed_ns) + +#else + + // Only time loops, not memory allocation or flow control + if (ind1 == NULL) { + assert(ind2 == NULL); + if (num_threads == 0) { + TIME(write_single_thread_0(buffer, input, N, use_avx), *elapsed_ns) + } else { + TIME(write_multi_thread_0(buffer, input, N, num_threads, use_avx), + *elapsed_ns) + } + return; + } + + if (ind2 == NULL) { + assert(ind1 != NULL); + if (num_threads == 0) { + TIME(write_single_thread_1(buffer, input, N, ind1, use_avx), *elapsed_ns) + } else { + TIME(write_multi_thread_1(buffer, input, N, ind1, num_threads, use_avx), + *elapsed_ns) + } + return; + } + + assert(ind1 != NULL); + assert(ind2 != NULL); + + if (num_threads == 0) { + TIME(write_single_thread_2(buffer, input, N, ind1, ind2, use_avx), + *elapsed_ns) + } else { + TIME(write_multi_thread_2(buffer, input, N, ind1, ind2, num_threads, + use_avx), + *elapsed_ns) + } + +#endif +} + +// lower and upper bounds are INCLUSIVE +size_t irand(size_t lower, size_t upper) { + assert(upper >= lower); + // drand48 returns a double in the range[0,1.0) (inclusive 0, exclusive 1), so + // multiply that by (upper - lower + 1) so that after truncating, we get an + // integer in the range [lower, upper] (both bounds inclusive) + size_t res = lower + (size_t)(drand48() * (double)(upper - lower + 1)); + assert((res >= lower) && (res <= upper)); + return res; +} + +#ifdef SINGLE_ALLOC +// don't do anything 
+#define RES_FREE + +#define INPUT_MALLOC +#define INPUT_FREE + +#define TMP_MALLOC +#define TMP_FREE + +#else +// do malloc and free +#define RES_FREE shm_free(res); + +#define INPUT_MALLOC double *input = (double *)shm_malloc(N * sizeof(double)); +#define INPUT_FREE shm_free(input); + +// this doesn't need to be in shared memory +#define TMP_MALLOC size_t *tmp = (size_t *)malloc(4 * N * sizeof(size_t)); +#define TMP_FREE free(tmp); + +#endif + +// return values: +// 0: read and write passed +// 1: read failed +// 2: write failed +#define CHECK_IMPL(ind1, ind2, IDX) \ + double *res = \ + read_data(data, N, ind1, ind2, time_read, num_threads, use_avx); \ + for (size_t i = 0; i < N; ++i) { \ + if (res[i] != data[IDX]) { \ + return 1; \ + } \ + } \ + RES_FREE; \ + \ + INPUT_MALLOC; \ + memset(input, 0, N * sizeof(double)); \ + \ + /* since we could have aliases, we can't compare data to input, since */ \ + /* some inputs could be overwritten by other inputs, and we also don't */ \ + /* have a guaranteed order in which aliased entries are written, so for */ \ + /* an aliased index, there are multiple valid values. First get alias */ \ + /* count and then make a list of valid values for each index. 
*/ \ + TMP_MALLOC; \ + memset(tmp, 0, 4 * N * sizeof(size_t)); \ + size_t *alias_cnt = tmp + 0 * N; \ + size_t *start_idx = tmp + 1 * N; \ + size_t *curr_idx = tmp + 2 * N; \ + size_t *src_idxs = tmp + 3 * N; \ + \ + for (size_t i = 0; i < N; ++i) { \ + input[i] = (double)i; \ + alias_cnt[IDX] += 1; \ + } \ + write_data(data, N, input, ind1, ind2, time_write, num_threads, use_avx); \ + \ + start_idx[0] = 0; \ + for (size_t i = 1; i < N; ++i) { \ + start_idx[i] = start_idx[i - 1] + alias_cnt[i - 1]; \ + } \ + for (size_t i = 0; i < N; ++i) { \ + size_t idx = IDX; \ + src_idxs[start_idx[idx] + curr_idx[idx]] = i; \ + curr_idx[idx] += 1; \ + } \ + \ + int ret = 0; \ + for (size_t i = 0; i < N; ++i) { \ + /* check that the value in data matches one of the possible input */ \ + /* values */ \ + size_t cnt = alias_cnt[i]; \ + if (cnt == 0) { \ + continue; \ + } \ + \ + bool match_found = false; \ + for (size_t j = start_idx[i]; j < start_idx[i] + cnt; ++j) { \ + if (data[i] == input[src_idxs[j]]) { \ + match_found = true; \ + break; \ + } \ + } \ + \ + if (!match_found) { \ + ret = 2; \ + break; \ + } \ + } \ + \ + INPUT_FREE; \ + TMP_FREE; \ + return ret; + +int check_0_level(double *data, size_t N, uint64_t *time_read, + uint64_t *time_write, size_t num_threads, bool use_avx) { + CHECK_IMPL(NULL, NULL, i) +} + +int check_1_level(double *data, size_t N, const size_t *ind, + uint64_t *time_read, uint64_t *time_write, size_t num_threads, + bool use_avx) { + CHECK_IMPL(ind, NULL, ind[i]) +} + +int check_2_level(double *data, size_t N, const size_t *ind1, + const size_t *ind2, uint64_t *time_read, uint64_t *time_write, + size_t num_threads, bool use_avx) { + CHECK_IMPL(ind1, ind2, ind2[ind1[i]]) +} + +// shuffle in-place the given indices from indices[0] to indices[N-1] +// using the modern Fisher-Yates shuffle (see +// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle and +// https://stackoverflow.com/a/10072899) +void shuffle(size_t *indices, size_t N) { + if (N 
<= 1) { + return; + } + + for (size_t i = 0; i < N - 1; ++i) { + size_t last_idx = N - 1 - i; + size_t index = irand(0, last_idx); + size_t temp = indices[index]; + indices[index] = indices[last_idx]; + indices[last_idx] = temp; + } +} + +void clustered_shuffle(size_t *indices, size_t N, size_t cluster_size) { + size_t num_clusters = (N + cluster_size - 1) / cluster_size; // round up + for (size_t c = 0; c < num_clusters; ++c) { + size_t cluster_start = c * cluster_size; + size_t cluster_end = cluster_start + cluster_size; + size_t this_cluster_size = + cluster_end < N ? cluster_size : N - cluster_start; + shuffle(indices + cluster_start, this_cluster_size); + } +} + +void add_aliases(size_t *indices, size_t N, double alias_fraction) { + for (size_t i = 0; i < N; ++i) { + if (drand48() < alias_fraction) { + // this index will be aliased, insert it at a random location + size_t idx = irand(0, N - 1); + indices[idx] = indices[i]; + } + } +} + +void add_clustered_aliases(size_t *indices, size_t N, double alias_fraction, + size_t cluster_size) { + size_t num_clusters = (N + cluster_size - 1) / cluster_size; // round up + for (size_t c = 0; c < num_clusters; ++c) { + size_t cluster_start = c * cluster_size; + size_t cluster_end = cluster_start + cluster_size; + size_t this_cluster_size = + cluster_end < N ? cluster_size : N - cluster_start; + add_aliases(indices + cluster_start, this_cluster_size, alias_fraction); + } +} + +bool report(const char *name, int result) { + (void)name; + // printf("%30s: %s\n", name, + // result == 0 + // ? "pass" + // : result == 1 ? "FAIL read" + // : result == 2 ? 
"FAIL write" : "FAIL unknown"); + + return (result == 0); +} + +void reset(double *data, size_t *ind1, size_t *ind2, size_t N) { + for (size_t i = 0; i < N; ++i) { + data[i] = (double)i; + ind1[i] = i; + ind2[i] = i; + } +} + +#define NUM_TESTS 11 + +bool run_test_suite(size_t N, size_t cluster_size, double alias_fraction, + size_t num_threads, bool use_avx, uint64_t *time_read, + uint64_t *time_write) { + // initialize random number generator, use a specific seed to make every run + // of the test suite use the same indirection + srand48(42); + + // things to test: + // - no indirection + // - 1 level of indirection + // . straight access + // . permutation (1-to-1), random and clustered + // . with aliases, random and clustered + // - 2 levels of indirection, same as above + +#ifndef SINGLE_ALLOC + double *data = (double *)shm_malloc(N * sizeof(double)); + size_t *ind1 = (size_t *)shm_malloc(N * sizeof(size_t)); + size_t *ind2 = (size_t *)shm_malloc(N * sizeof(size_t)); +#endif + + bool all_pass = true; + + // No indirection + reset(data, ind1, ind2, N); + all_pass &= report("No indirection", + check_0_level(data, N, time_read + 0, time_write + 0, + num_threads, use_avx)); + + // 1 level of indirection + + // straight access + reset(data, ind1, ind2, N); + all_pass &= report("1-lev straight", + check_1_level(data, N, ind1, time_read + 1, time_write + 1, + num_threads, use_avx)); + + // permutation (no aliases) + reset(data, ind1, ind2, N); + shuffle(ind1, N); + all_pass &= report("1-lev full shuffle no alias", + check_1_level(data, N, ind1, time_read + 2, time_write + 2, + num_threads, use_avx)); + + reset(data, ind1, ind2, N); + clustered_shuffle(ind1, N, cluster_size); + all_pass &= report("1-lev clustered no alias", + check_1_level(data, N, ind1, time_read + 3, time_write + 3, + num_threads, use_avx)); + + // with aliases + reset(data, ind1, ind2, N); + add_aliases(ind1, N, alias_fraction); + shuffle(ind1, N); + all_pass &= report("1-lev full shuffle with 
alias", + check_1_level(data, N, ind1, time_read + 4, time_write + 4, + num_threads, use_avx)); + + reset(data, ind1, ind2, N); + add_clustered_aliases(ind1, N, alias_fraction, cluster_size); + clustered_shuffle(ind1, N, cluster_size); + all_pass &= report("1-lev clustered with alias", + check_1_level(data, N, ind1, time_read + 5, time_write + 5, + num_threads, use_avx)); + + // 2 level of indirection + + // straight access + reset(data, ind1, ind2, N); + all_pass &= report("2-lev straight", + check_2_level(data, N, ind1, ind2, time_read + 6, + time_write + 6, num_threads, use_avx)); + + // permutation (no aliases) + reset(data, ind1, ind2, N); + shuffle(ind1, N); + shuffle(ind2, N); + all_pass &= report("2-lev full shuffle no alias", + check_2_level(data, N, ind1, ind2, time_read + 7, + time_write + 7, num_threads, use_avx)); + + reset(data, ind1, ind2, N); + clustered_shuffle(ind1, N, cluster_size); + clustered_shuffle(ind2, N, cluster_size); + all_pass &= report("2-lev clustered no alias", + check_2_level(data, N, ind1, ind2, time_read + 8, + time_write + 8, num_threads, use_avx)); + + // with aliases + reset(data, ind1, ind2, N); + add_aliases(ind1, N, alias_fraction); + add_aliases(ind2, N, alias_fraction); + shuffle(ind1, N); + shuffle(ind2, N); + all_pass &= report("2-lev full shuffle with alias", + check_2_level(data, N, ind1, ind2, time_read + 9, + time_write + 9, num_threads, use_avx)); + + reset(data, ind1, ind2, N); + add_clustered_aliases(ind1, N, alias_fraction, cluster_size); + add_clustered_aliases(ind2, N, alias_fraction, cluster_size); + clustered_shuffle(ind1, N, cluster_size); + clustered_shuffle(ind2, N, cluster_size); + all_pass &= report("2-lev clustered with alias", + check_2_level(data, N, ind1, ind2, time_read + 10, + time_write + 10, num_threads, use_avx)); + +#ifndef SINGLE_ALLOC + shm_free(data); + shm_free(ind1); + shm_free(ind2); +#endif + + return all_pass; +} + +void benchmark(size_t N, size_t cluster_size, double alias_fraction, + 
size_t num_threads, bool use_avx) { + size_t num_runs = 5; + size_t ignore_first_num = 1; + + bool all_pass = true; + uint64_t time_read[NUM_TESTS], time_read_sum[NUM_TESTS]; + uint64_t time_write[NUM_TESTS], time_write_sum[NUM_TESTS]; + + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read[j] = 0; + time_write[j] = 0; + time_read_sum[j] = 0; + time_write_sum[j] = 0; + } + + for (size_t i = 0; i < num_runs; ++i) { + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read[j] = 0; + time_write[j] = 0; + } + + all_pass &= run_test_suite(N, cluster_size, alias_fraction, num_threads, + use_avx, time_read, time_write); + + if (i >= ignore_first_num) { + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read_sum[j] += time_read[j]; + time_write_sum[j] += time_write[j]; + } + } + } + + printf("%8zu ", num_threads); + // we want to compute bandwidth in MiB/s, multiply data by number of tests + // timed + double bw_mult = (double)(N * sizeof(double) * (num_runs - ignore_first_num)); + // now divide to get GiB and multiply by 1e9 because time is in ns + bw_mult *= 1e9 / (1024.0 * 1024.0 * 1024.0); + + uint64_t total_read = 0; + uint64_t total_write = 0; + for (size_t j = 0; j < NUM_TESTS; ++j) { + total_read += time_read_sum[j]; + total_write += time_write_sum[j]; + printf("%4.1f | %4.1f ", bw_mult / (double)time_read_sum[j], + bw_mult / (double)time_write_sum[j]); + } + printf("%4.1f | %4.1f %s\n", NUM_TESTS * bw_mult / (double)total_read, + NUM_TESTS * bw_mult / (double)total_write, + all_pass ? "all pass" : "some FAILED"); +} + +void run_benchmarks(size_t N, size_t cluster_size, double alias_fraction, + bool use_avx) { + printf("\nRunning tests %s AVX intrinsics\n", use_avx ? 
"with" : "WITHOUT"); + const char *names[NUM_TESTS] = {"0-str", "1-str", "1-FnoA", "1-CnoA", + "1-FA", "1-CA", "2-str", "2-FnoA", + "2-CnoA", "2-FA", "2-CA"}; + printf("%8s ", "Threads"); + for (size_t j = 0; j < NUM_TESTS; ++j) { + printf("%11s ", names[j]); + } + printf("%11s\n", "Total"); + + benchmark(N, cluster_size, alias_fraction, 0, use_avx); + benchmark(N, cluster_size, alias_fraction, 1, use_avx); + benchmark(N, cluster_size, alias_fraction, 2, use_avx); + benchmark(N, cluster_size, alias_fraction, 4, use_avx); + benchmark(N, cluster_size, alias_fraction, 8, use_avx); + benchmark(N, cluster_size, alias_fraction, 16, use_avx); + benchmark(N, cluster_size, alias_fraction, 24, use_avx); + benchmark(N, cluster_size, alias_fraction, 32, use_avx); + benchmark(N, cluster_size, alias_fraction, 48, use_avx); +} + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + size_t N = 1024 * 1024; + size_t cluster_size = 32; + double alias_fraction = 0.1; + +#ifdef USE_CLIENT + printf("Running tests using the memory controller, which must be started " + "before this executable is run\n"); + client.chatty = 0; + init(&client); +#else + printf("Running tests WITHOUT using the memory controller\n"); +#endif + +#ifdef SINGLE_ALLOC + data = (double *)shm_malloc(N * sizeof(double)); + res = (double *)shm_malloc(N * sizeof(double)); + buffer = (double *)shm_malloc(N * sizeof(double)); + input = (double *)shm_malloc(N * sizeof(double)); + ind1 = (size_t *)shm_malloc(N * sizeof(size_t)); + ind2 = (size_t *)shm_malloc(N * sizeof(size_t)); + + // doesn't have to be shared memory + tmp = (size_t *)malloc(4 * N * sizeof(size_t)); +#endif + + printf( + "Benchmark results (average read | write bandwidth in GiB/s), N = %zu\n", + N); + printf(" 0|1|2: number of levels of indirection\n"); + printf(" str: straight access\n"); + printf(" F|C: full or clustered shuffle\n"); + printf(" A|noA: with or without aliases\n\n"); + + run_benchmarks(N, 
cluster_size, alias_fraction, false); + run_benchmarks(N, cluster_size, alias_fraction, true); + +#ifdef USE_CLIENT + // send quit request + // struct request quit_req; + // quit_req.client = client_id; + // quit_req.r_type = Quit; + // quit_req.id = ++req_id; + // scoria_put_request(&client, &quit_req); + // wait_request(&client, &quit_req); + + cleanup(&client); +#endif + +#ifdef SINGLE_ALLOC + shm_free(data); + shm_free(res); + shm_free(buffer); + shm_free(input); + shm_free(ind1); + shm_free(ind2); + + free(tmp); +#endif + + return 0; +} diff --git a/tests/test_spatter.c b/tests/test_spatter.c new file mode 100644 index 0000000..a8cd1f3 --- /dev/null +++ b/tests/test_spatter.c @@ -0,0 +1,180 @@ +#include + +#include "client.h" +#include "client_cleanup.h" +#include "client_init.h" +#include "client_memory.h" +#include "client_place_requests.h" +#include "config.h" +#include "parse-args.h" +#include "shm_malloc.h" + +#define SP_MAX_ALLOC (65 * 1000 * 1000 * 1000) + +size_t remap_pattern(const int nrc, unsigned long *pattern, + const size_t pattern_len); + +int main(int argc, char **argv) { + struct run_config *rc; + int nrc = 0; + + parse_args(argc, argv, &nrc, &rc); + + if (nrc <= 0) { + printf("No run configurations parsed\n"); + return 1; + } + + if (rc[0].kernel != GATHER && rc[0].kernel != SCATTER && rc[0].kernel != GS) { + printf("Error: Unsupported kernel\n"); + exit(1); + } + + size_t max_pattern_val = 0; + + for (int i = 0; i < nrc; i++) + max_pattern_val = remap_pattern(nrc, rc[i].pattern, rc[i].pattern_len); + + printf("max pattern val: %d\n", max_pattern_val); + + struct client client; + client.chatty = 1; + init(&client); + + for (int i = 0; i < nrc; i++) { + for (int j = -1; j <= (int)rc[i].nruns; j++) { + size_t N = rc[i].pattern_len; + + double *res = (double *)shm_malloc(max_pattern_val * sizeof(double)); + double *input = (double *)shm_malloc(max_pattern_val * sizeof(double)); + + size_t *pattern = shm_malloc(N * sizeof(size_t)); + + 
memset(res, 0, max_pattern_val * sizeof(double)); + + for (size_t k = 0; k < max_pattern_val; k++) + input[k] = (double)(2 * k); + + for (size_t k = 0; k < N; k++) + pattern[k] = (size_t)rc[i].pattern[k]; + + switch (rc[i].kernel) { + case SCATTER: + printf("Scatter with Length: %d\n", N); + + struct request write_req; + scoria_write(&client, res, N, input, pattern, NULL, 0, 0, &write_req); + wait_request(&client, &write_req); + + break; + case GATHER: + printf("Gather with Length: %d\n", N); + + struct request read_req; + scoria_read(&client, input, N, res, pattern, NULL, 0, 0, &read_req); + wait_request(&client, &read_req); + + break; + default: + printf("Error: Unable to determine kernel\n"); + break; + } + + shm_free(res); + shm_free(input); + shm_free(pattern); + } + } + + cleanup(&client); + + return 0; +} + +size_t remap_pattern(const int nrc, unsigned long *pattern, + const size_t pattern_len) { + size_t max_pattern_val = pattern[0]; + + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] > max_pattern_val) { + max_pattern_val = pattern[j]; + } + } + + // Post-Processing to make heap values fit + size_t boundary = (((SP_MAX_ALLOC - 1) / sizeof(sgData_t)) / nrc) / 2; + // printf("Boundary: %zu, max_pattern_val: %zu, difference: %zu\n", boundary, + // max_pattern_val, max_pattern_val - boundary); + + if (max_pattern_val >= boundary) { + // printf("Inside of boundary if statement\n"); + int outside_boundary = 0; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + outside_boundary++; + } + } + + // printf("Configuration: %d, Number of indices outside of boundary: %d, + // Total pattern length: %d, Frequency of outside of boundary indices: + // %.2f\n", i, outside_boundary, rc[i].pattern_len, (double)outside_boundary + // / (double)rc[i].pattern_len ); + + // Initialize map to sentinel value of -1 + size_t heap_map[outside_boundary][2]; + for (size_t j = 0; j < outside_boundary; j++) { + heap_map[j][0] = -1; + heap_map[j][1] = 
-1; + } + + int pos = 0; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + // Search if exists in map + int found = 0; + for (size_t k = 0; k < pos; k++) { + if (heap_map[k][0] == pattern[j]) { + // printf("Already found %zu at position %zu\n", rc[i].pattern[j], + // k); + found = 1; + break; + } + } + + // If not found, add to map + if (!found) { + // printf("Inserting %zu, %zu into the heap map at position %zu\n", + // rc[i].pattern[j], boundary - pos, pos); + heap_map[pos][0] = pattern[j]; + heap_map[pos][1] = boundary - pos; + pos++; + } + } + } + + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + // Find entry in map + int idx = -1; + for (size_t k = 0; k < pos; k++) { + if (heap_map[k][0] == pattern[j]) { + // printf("Found at index: %d\n", k); + + // Map heap address to new address inside of boundary + pattern[j] = heap_map[k][1]; + break; + } + } + } + } + + max_pattern_val = pattern[0]; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] > max_pattern_val) { + max_pattern_val = pattern[j]; + } + } + } + + return max_pattern_val; +}