diff --git a/.gitignore b/.gitignore index c6127b3..fd12ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,18 @@ +# Custom +*~ +build*/ +debug*/ +scripts/ +compile_commands.json +.clangd/ +shm_malloc/*.o +shm_malloc/*.a +shm_malloc/tanon +shm_malloc/tshm1 +shm_malloc/tshm1db +shm_malloc/tshm2 +shm_malloc/tshm2db + # Prerequisites *.d diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..cbcc0ca --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/argtable3"] + path = external/argtable3 + url = https://github.com/argtable/argtable3.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a622983 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(src) + +add_compile_options(-Wall -Wextra -pedantic -Wno-unused-function -Wno-strict-prototypes -g) + +# JL: Can't use STREQUAL here because the compiler ID could be "IntelLLVM" +if (CMAKE_C_COMPILER_ID MATCHES "Intel") + add_compile_options(-mavx512f -Wno-debug-disables-optimization) +else() + add_compile_options(-g) +endif() + +set(CMAKE_C_STANDARD 11) + +include_directories(include) +include_directories(include/controller) +include_directories(include/client) +include_directories(include/posix) +include_directories(include/shared) +include_directories(include/uthash) + +add_executable(client client.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(client shm pthread rt) + +add_executable(controller controller.c ${CONTROLLER_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(controller shm pthread rt) + + +find_package(Git QUIET) +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") +# Update submodules as needed + option(GIT_SUBMODULE "Check submodules during build" ON) + if(GIT_SUBMODULE) + message(STATUS "Submodule update") + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update 
--init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") + endif() + endif() +endif() + +if(NOT EXISTS "${PROJECT_SOURCE_DIR}/external/argtable3/CMakeLists.txt") + message(FATAL_ERROR "The submodules were not downloaded! GIT_SUBMODULE was turned off or failed. Please update submodules and try again.") +endif() + +add_subdirectory(external/argtable3) +include_directories(external/argtable3/src) + +add_subdirectory(tests) diff --git a/README.md b/README.md new file mode 100644 index 0000000..84944d8 --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# ScorIA + +## Description +Prototype testbed for various memory acceleration schemes focused on improving sparse memory accesses and scatter/gather performance through indirection arrays. + +## Installation +``` +mkdir build +cd build +cmake .. 
+make
+```
+
+## Usage
+Terminal window 1
+```
+./controller
+```
+
+Terminal window 2
+```
+./tests/test_client
+```
+
+## Roadmap
+Current Work:
+- [x] Request Ring Buffer
+- [x] Controller Request Handler
+- [x] Develop Initial Test Cases
+- [x] Implement Read, Write
+- [x] Asynchronous Requests
+- [x] Atomic, Serialized, and Parallel Writes (Aliasing)
+- [x] Multi-threading
+- [x] Multi-client
+- [x] Integration with Spatter
+
+Future:
+- [ ] AVX intrinsics
+- [ ] SVE intrinsics
+- [ ] Read/Write Dependency Graphs
+- [ ] Run and Test scripts
+
+Initial Requirements:
+- [x] Multi-Process (Separation between client and server)
+- [x] Server is the memory controller
+- [x] Memory controller needs to be multi-threaded
+- [x] Needs to be able to use vector load store (for optimized bandwidth)
+- [x] Needs an API for taking memory access requests
+- [x] Needs to run on CPU hardware
+- [ ] Needs to be numa aware
+- [ ] Start prototyping on SPR (DDR)
+- [x] Define some test program for driving this
+ - [x] allocate A[], B[], C[]
+ - [x] fetch A[B[C[i]]] for i = 0:1000
+ - [x] D[1000]
+- [x] Determine what we will use for programming language (C)
+ - [x] FPGA friendly
+- [ ] Modular SGDMA
+- [x] Determine the shared memory programming paradigm
+- [x] Specialized memory allocator
+ - [x] Everything in shared memory
+ - [x] A, B, and C need to be allocated in shared memory (shared between the two process spaces)
+ - [x] These should probably be allocated at the same virtual address locations
+- [ ] Synchronous vs. 
Asynchronous + - [x] MPI_IReceive and MPI_Wait as inspiration + - [x] Handles of some sort + - [x] Could also just be an update of the latest fetched index (in a prefetching workload) in a well-known mailbox + +## Authors and acknowledgment +- Jered Dominguez-Trujillo jereddt@lanl.gov +- Jonas Lippuner jlippuner@lanl.gov diff --git a/client.c b/client.c new file mode 100644 index 0000000..57bf6b2 --- /dev/null +++ b/client.c @@ -0,0 +1,26 @@ +#include + +#include "client.h" +#include "config.h" + +#include "client_cleanup.h" +#include "client_init.h" +#include "client_place_requests.h" + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + struct client client; + client.chatty = 0; + + init(&client); + place_requests(&client); + + cleanup(&client); + + printf("Exiting...\n"); + + return 0; +} diff --git a/controller.c b/controller.c new file mode 100644 index 0000000..a706d06 --- /dev/null +++ b/controller.c @@ -0,0 +1,26 @@ +#include + +#include "config.h" +#include "controller.h" + +#include "controller_cleanup.h" +#include "controller_handle_requests.h" +#include "controller_init.h" + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + struct controller controller; + controller.chatty = 1; + + init(&controller); + handle_requests(&controller); + + cleanup(&controller); + + printf("Exiting...\n"); + + return 0; +} diff --git a/external/argtable3 b/external/argtable3 new file mode 160000 index 0000000..f46ef20 --- /dev/null +++ b/external/argtable3 @@ -0,0 +1 @@ +Subproject commit f46ef208a7ebc941c9f56027112b6894baa4c3f9 diff --git a/include/client.h b/include/client.h new file mode 100644 index 0000000..2a64bac --- /dev/null +++ b/include/client.h @@ -0,0 +1,30 @@ +#ifndef CLIENT_H +#define CLIENT_H + +#include + +#include "config.h" +#include "request.h" + +struct client { + int id; + + int fd_location; + int fd_requests; + int fd_completions; + + int chatty; + + struct 
memory_location *shared_location; + + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; + struct shared_memory *shared_mem_ptr; + + struct request_queue *shared_requests; + struct request_queue *shared_completions; + + struct request *unmatched_requests; +}; + +#endif /* CLIENT_H */ diff --git a/include/client/client_cleanup.h b/include/client/client_cleanup.h new file mode 100644 index 0000000..ac57abb --- /dev/null +++ b/include/client/client_cleanup.h @@ -0,0 +1,10 @@ +#ifndef CLIENT_CLEANUP +#define CLIENT_CLEANUP + +#include "client.h" + +void cleanup_queues(struct client *client); +void cleanup_shared_mem(struct client *client); +void cleanup(struct client *client); + +#endif /* CLIENT_CLEANUP */ diff --git a/include/client/client_init.h b/include/client/client_init.h new file mode 100644 index 0000000..d67ca6b --- /dev/null +++ b/include/client/client_init.h @@ -0,0 +1,13 @@ +#ifndef CLIENT_INIT +#define CLIENT_INIT + +#include "client.h" +#include "request.h" + +void init_memory_pool(struct client *client); +void init_requests(struct client *client); +void init_completions(struct client *client); +void init_id(struct client *client); +void init(struct client *client); + +#endif /* CLIENT_INIT */ diff --git a/include/client/client_memory.h b/include/client/client_memory.h new file mode 100644 index 0000000..898863b --- /dev/null +++ b/include/client/client_memory.h @@ -0,0 +1,19 @@ +#ifndef CLIENT_MEMORY_H +#define CLIENT_MEMORY_H + +#include + +#include "client.h" +#include "request.h" + +void scoria_put_request(struct client *client, struct request *req); + +void scoria_quit(struct client *client, struct request *req); +void scoria_read(struct client *client, void *buffer, const size_t N, + void *output, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req); +void scoria_write(struct client *client, void *buffer, const size_t N, + void *input, const size_t 
*ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req); + +#endif /* CLIENT_MEMORY_H */ diff --git a/include/client/client_place_requests.h b/include/client/client_place_requests.h new file mode 100644 index 0000000..1804b1f --- /dev/null +++ b/include/client/client_place_requests.h @@ -0,0 +1,12 @@ +#ifndef CLIENT_PLACE_REQUESTS +#define CLIENT_PLACE_REQUESTS + +#include "client.h" +#include "request.h" + +void wait_request(struct client *client, struct request *req); +void wait_requests(struct client *client, struct request *reqs, + size_t num_reqs); +void place_requests(struct client *client); + +#endif /* CLIENT_PLACE_REQUESTS */ diff --git a/include/client/client_read_location.h b/include/client/client_read_location.h new file mode 100644 index 0000000..ce7c943 --- /dev/null +++ b/include/client/client_read_location.h @@ -0,0 +1,8 @@ +#ifndef CLIENT_READ_LOCATION +#define CLIENT_READ_LOCATION + +#include "client.h" + +void read_location(struct client *client); + +#endif /* CLIENT_READ_LOCATION */ diff --git a/include/config.h b/include/config.h new file mode 100644 index 0000000..0f1a67f --- /dev/null +++ b/include/config.h @@ -0,0 +1,30 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define SHARED_MEMORY_NAME "shared-mem" +#define SHARED_LOCATION_NAME "/mem-controller-location" +#define SHARED_REQUESTS_NAME "/mem-request-queue" +#define SHARED_COMPLETIONS_NAME "/mem-completion-queue" + +#define REQUEST_QUEUE_SIZE 100 +#define MAX_CLIENTS 64 + +struct list { + struct list *next; + int data; +}; + +struct shared_memory { + struct list *head; + struct list **tail; +}; + +struct memory_location { + int ready; + + struct shared_memory *shared_mem_ptr; + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; +}; + +#endif /* CONFIG_H */ diff --git a/include/controller.h b/include/controller.h new file mode 100644 index 0000000..ada4ccc --- /dev/null +++ b/include/controller.h @@ -0,0 +1,23 @@ 
+#ifndef CONTROLLER_H +#define CONTROLLER_H + +#include + +#include "config.h" +#include "request.h" + +struct controller { + int fd_location; + int fd_requests; + int fd_completions; + + int chatty; + + struct memory_location *shared_location; + + struct request_queue_list *shared_requests_list; + struct request_queue_list *shared_completions_list; + struct shared_memory *shared_mem_ptr; +}; + +#endif /* CONTROLLER_H */ diff --git a/include/controller/controller_cleanup.h b/include/controller/controller_cleanup.h new file mode 100644 index 0000000..b98eb61 --- /dev/null +++ b/include/controller/controller_cleanup.h @@ -0,0 +1,9 @@ +#ifndef CONTROLLER_CLEANUP +#define CONTROLLER_CLEANUP + +#include "controller.h" + +void cleanup_shared_mem(struct controller *controller); +void cleanup(struct controller *controller); + +#endif /* CONTROLLER_CLEANUP */ diff --git a/include/controller/controller_handle_requests.h b/include/controller/controller_handle_requests.h new file mode 100644 index 0000000..5cc82ad --- /dev/null +++ b/include/controller/controller_handle_requests.h @@ -0,0 +1,20 @@ +#ifndef CONTROLLER_HANDLE_REQUESTS +#define CONTROLLER_HANDLE_REQUESTS + +#include "controller.h" +#include "request.h" + +struct thread_args { + size_t i; + struct controller *controller; +}; + +void handle_read(struct controller *controller, struct request_queue *queue, + struct request *req); +void handle_write(struct controller *controller, struct request_queue *queue, + struct request *req); + +void *handler(void *args); +void handle_requests(struct controller *controller); + +#endif /* CONTROLLER_HANDLE_REQUESTS */ diff --git a/include/controller/controller_init.h b/include/controller/controller_init.h new file mode 100644 index 0000000..9776f01 --- /dev/null +++ b/include/controller/controller_init.h @@ -0,0 +1,13 @@ +#ifndef CONTROLLER_INIT +#define CONTROLLER_INIT + +#include "controller.h" +#include "request.h" + +void init_files(); +void init_memory_pool(struct controller 
*controller); +void init_requests(struct controller *controller); +void init_completions(struct controller *controller); +void init(struct controller *controller); + +#endif /* CONTROLLER_INIT */ diff --git a/include/controller/controller_write_location.h b/include/controller/controller_write_location.h new file mode 100644 index 0000000..cb237a3 --- /dev/null +++ b/include/controller/controller_write_location.h @@ -0,0 +1,8 @@ +#ifndef CONTROLLER_WRITE_LOCATION +#define CONTROLLER_WRITE_LOCATION + +#include "controller.h" + +void write_location(struct controller *controller); + +#endif /* CONTROLLER_WRITE_LOCATION */ diff --git a/include/posix/posix_sm.h b/include/posix/posix_sm.h new file mode 100644 index 0000000..cde60fb --- /dev/null +++ b/include/posix/posix_sm.h @@ -0,0 +1,19 @@ +#ifndef POSIX_SM_H +#define POSIX_SM_H + +#include +#include + +int scoria_sm_open(const char *name, int oflag, mode_t mode, const char *msg); + +void scoria_sm_unlink(const char *name, const char *msg); + +void scoria_sm_truncate(const int fd, const size_t length, const char *msg); + +void *scoria_sm_map(void *addr, const size_t length, const int prot, + const int flags, const int fd, const off_t offset, + const char *msg); + +void scoria_sm_unmap(void *ptr, const size_t length, const char *msg); + +#endif /* POSIX_SM_H */ diff --git a/include/shared/backend-support-tests.h b/include/shared/backend-support-tests.h new file mode 100644 index 0000000..aaa1467 --- /dev/null +++ b/include/shared/backend-support-tests.h @@ -0,0 +1,49 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. + +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. 
Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ +#ifndef BACKEND_SUPPORT_TESTS_H +#define BACKEND_SUPPORT_TESTS_H +int sg_cuda_support(); +int sg_opencl_support(); +int sg_openmp_support(); +int sg_serial_support(); +#endif diff --git a/include/shared/json.h b/include/shared/json.h new file mode 100644 index 0000000..25628c3 --- /dev/null +++ b/include/shared/json.h @@ -0,0 +1,243 @@ + +/* vim: set et ts=3 sw=3 sts=3 ft=c: + * + * Copyright (C) 2012, 2013, 2014 James McLaughlin et al. All rights reserved. + * https://github.com/udp/json-parser + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _JSON_H +#define _JSON_H + +#ifndef json_char +#define json_char char +#endif + +#ifndef json_int_t +#ifndef _MSC_VER +#include +#define json_int_t int64_t +#else +#define json_int_t __int64 +#endif +#endif + +#include + +#ifdef __cplusplus + +#include + +extern "C" { + +#endif + +typedef struct { + unsigned long max_memory; + int settings; + + /* Custom allocator support (leave null to use malloc/free) + */ + + void *(*mem_alloc)(size_t, int zero, void *user_data); + void (*mem_free)(void *, void *user_data); + + void *user_data; /* will be passed to mem_alloc and mem_free */ + + size_t value_extra; /* how much extra space to allocate for values? */ + +} json_settings; + +#define json_enable_comments 0x01 + +typedef enum { + json_none, + json_object, + json_array, + json_integer, + json_double, + json_string, + json_boolean, + json_null + +} json_type; + +extern const struct _json_value json_value_none; + +typedef struct _json_object_entry { + json_char *name; + unsigned int name_length; + + struct _json_value *value; + +} json_object_entry; + +typedef struct _json_value { + struct _json_value *parent; + + json_type type; + + union { + int boolean; + json_int_t integer; + double dbl; + + struct { + unsigned int length; + json_char *ptr; /* null terminated */ + + } string; + + struct { + unsigned int length; + + json_object_entry *values; + +#if defined(__cplusplus) && __cplusplus >= 201103L + decltype(values) begin() const { return values; } + decltype(values) end() const { return values + length; } +#endif + + } object; + + struct { + unsigned int length; + struct _json_value **values; + +#if defined(__cplusplus) && __cplusplus >= 201103L + decltype(values) begin() const { return values; } + decltype(values) end() const { return values + length; } +#endif + + } array; + + } u; + + union { + struct _json_value *next_alloc; + void *object_mem; + + } _reserved; + +#ifdef JSON_TRACK_SOURCE + + /* Location of the value in the source JSON + */ + 
unsigned int line, col; + +#endif + + /* Some C++ operator sugar */ + +#ifdef __cplusplus + +public: + inline _json_value() { memset(this, 0, sizeof(_json_value)); } + + inline const struct _json_value &operator[](int index) const { + if (type != json_array || index < 0 || + ((unsigned int)index) >= u.array.length) { + return json_value_none; + } + + return *u.array.values[index]; + } + + inline const struct _json_value &operator[](const char *index) const { + if (type != json_object) + return json_value_none; + + for (unsigned int i = 0; i < u.object.length; ++i) + if (!strcmp(u.object.values[i].name, index)) + return *u.object.values[i].value; + + return json_value_none; + } + + inline operator const char *() const { + switch (type) { + case json_string: + return u.string.ptr; + + default: + return ""; + }; + } + + inline operator json_int_t() const { + switch (type) { + case json_integer: + return u.integer; + + case json_double: + return (json_int_t)u.dbl; + + default: + return 0; + }; + } + + inline operator bool() const { + if (type != json_boolean) + return false; + + return u.boolean != 0; + } + + inline operator double() const { + switch (type) { + case json_integer: + return (double)u.integer; + + case json_double: + return u.dbl; + + default: + return 0; + }; + } + +#endif + +} json_value; + +json_value *json_parse(const json_char *json, size_t length); + +#define json_error_max 128 +json_value *json_parse_ex(json_settings *settings, const json_char *json, + size_t length, char *error); + +void json_value_free(json_value *); + +/* Not usually necessary, unless you used a custom mem_alloc and now want to + * use a custom mem_free. 
+ */ +void json_value_free_ex(json_settings *settings, json_value *); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/include/shared/kernels.h b/include/shared/kernels.h new file mode 100644 index 0000000..e3944a5 --- /dev/null +++ b/include/shared/kernels.h @@ -0,0 +1,396 @@ +#include +#include +#include +#include +#include + +#include + +static_assert(sizeof(size_t) == 8, "size_t is expected to be a 64-bit integer"); + +#define FORCE_INLINE __attribute__((always_inline)) static inline + +// =========================================================================== +// AVX KERNELS +// =========================================================================== + +#define READ_0_AVX(buffer, res, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + vals = _mm512_load_pd(buffer + idx); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[idx]; \ + } + +#define READ_1_AVX(buffer, res, ind, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices = _mm512_load_epi64(ind + idx); \ + vals = _mm512_i64gather_pd(indices, buffer, 8); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[ind[idx]]; \ + } + +#define READ_2_AVX(buffer, res, ind1, ind2, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices1, indices2; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices1 = _mm512_load_epi64(ind1 + idx); \ + 
indices2 = _mm512_i64gather_epi64(indices1, ind2, 8); \ + vals = _mm512_i64gather_pd(indices2, buffer, 8); \ + _mm512_store_pd(res + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + res[idx] = buffer[ind2[ind1[idx]]]; \ + } + +#define WRITE_0_AVX(buffer, input, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + vals = _mm512_load_pd(input + idx); \ + _mm512_store_pd(buffer + idx, vals); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[idx] = input[idx]; \ + } + +#define WRITE_1_AVX(buffer, input, ind, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices = _mm512_load_epi64(ind + idx); \ + vals = _mm512_load_pd(input + idx); \ + _mm512_i64scatter_pd(buffer, indices, vals, 8); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[ind[idx]] = input[idx]; \ + } + +#define WRITE_2_AVX(buffer, input, ind1, ind2, start, end) \ + /* process in chunks of 8 elements */ \ + __m512d vals; \ + __m512i indices1, indices2; \ + \ + size_t idx = start; \ + /* subtract 7 from end to not go over, we'll deal with leftovers below */ \ + for (; idx < end - 7; idx += 8) { \ + indices1 = _mm512_load_epi64(ind1 + idx); \ + indices2 = _mm512_i64gather_epi64(indices1, ind2, 8); \ + vals = _mm512_load_pd(input + idx); \ + _mm512_i64scatter_pd(buffer, indices2, vals, 8); \ + } \ + \ + /* deal with leftovers */ \ + for (; idx < end; ++idx) { \ + buffer[ind2[ind1[idx]]] = input[idx]; \ + } + +// =========================================================================== +// SINGLE THREADED +// 
=========================================================================== + +FORCE_INLINE void read_single_thread_0(double *res, const double *buffer, + size_t N, bool use_avx) { + if (use_avx) { + READ_0_AVX(buffer, res, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[i]; + } + } +} + +FORCE_INLINE void read_single_thread_1(double *res, const double *buffer, + size_t N, const size_t *ind1, + bool use_avx) { + if (use_avx) { + READ_1_AVX(buffer, res, ind1, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[ind1[i]]; + } + } +} + +FORCE_INLINE void read_single_thread_2(double *res, const double *buffer, + size_t N, const size_t *ind1, + const size_t *ind2, bool use_avx) { + if (use_avx) { + READ_2_AVX(buffer, res, ind1, ind2, 0, N) + } else { + for (size_t i = 0; i < N; ++i) { + res[i] = buffer[ind2[ind1[i]]]; + } + } +} + +FORCE_INLINE void write_single_thread_0(double *buffer, const double *input, + size_t N, bool use_avx) { + if (use_avx) { + WRITE_0_AVX(buffer, input, 0, N); + } else { + for (size_t i = 0; i < N; ++i) { + buffer[i] = input[i]; + } + } +} + +FORCE_INLINE void write_single_thread_1(double *buffer, const double *input, + size_t N, const size_t *ind1, + bool use_avx) { + for (size_t i = 0; i < N; ++i) { + buffer[ind1[i]] = input[i]; + } +} + +FORCE_INLINE void write_single_thread_2(double *buffer, const double *input, + size_t N, const size_t *ind1, + const size_t *ind2, bool use_avx) { + for (size_t i = 0; i < N; ++i) { + buffer[ind2[ind1[i]]] = input[i]; + } +} + +// =========================================================================== +// MULTI THREADED +// =========================================================================== + +#define MIN(x, y) (x < y ? x : y) +#define MAX(x, y) (x > y ? 
x : y) + +#define THREAD_TEMPLATE(N, n_threads, thread_args, thread_func, \ + extra_args_setup) \ + size_t chunk_size = (N + n_threads - 1) / n_threads; /* round up */ \ + \ + pthread_t threads[n_threads]; \ + struct thread_args args[n_threads]; \ + \ + for (size_t i = 0; i < n_threads; ++i) { \ + args[i].buffer = buffer; \ + args[i].start = i * chunk_size; \ + args[i].end = MIN((i + 1) * chunk_size, N); \ + extra_args_setup; \ + \ + int ret = pthread_create(&threads[i], NULL, thread_func, &args[i]); \ + (void)ret; \ + assert(ret == 0); \ + } \ + \ + for (size_t i = 0; i < n_threads; ++i) { \ + pthread_join(threads[i], NULL); \ + } + +struct read_th_args_0 { + double *res; + const double *buffer; + size_t start, end; +}; + +void *read_th_0(void *args) { + struct read_th_args_0 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[i]; + } + return NULL; +} + +void *read_th_0_avx(void *args) { + struct read_th_args_0 *a = args; + READ_0_AVX(a->buffer, a->res, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_0(double *res, const double *buffer, + size_t N, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_0, + use_avx ? read_th_0_avx : read_th_0, { args[i].res = res; }); +} + +struct read_th_args_1 { + double *res; + const double *buffer; + size_t start, end; + const size_t *ind1; +}; + +void *read_th_1(void *args) { + struct read_th_args_1 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[a->ind1[i]]; + } + return NULL; +} + +void *read_th_1_avx(void *args) { + struct read_th_args_1 *a = args; + READ_1_AVX(a->buffer, a->res, a->ind1, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_1(double *res, const double *buffer, + size_t N, const size_t *ind1, + size_t n_threads, bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_1, + use_avx ? 
read_th_1_avx : read_th_1, { + args[i].res = res; + args[i].ind1 = ind1; + }); +} + +struct read_th_args_2 { + double *res; + const double *buffer; + size_t start, end; + const size_t *ind1, *ind2; +}; + +void *read_th_2(void *args) { + struct read_th_args_2 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->res[i] = a->buffer[a->ind2[a->ind1[i]]]; + } + return NULL; +} + +void *read_th_2_avx(void *args) { + struct read_th_args_2 *a = args; + READ_2_AVX(a->buffer, a->res, a->ind1, a->ind2, a->start, a->end) + return NULL; +} + +FORCE_INLINE void read_multi_thread_2(double *res, const double *buffer, + size_t N, const size_t *ind1, + const size_t *ind2, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, read_th_args_2, + use_avx ? read_th_2_avx : read_th_2, { + args[i].res = res; + args[i].ind1 = ind1; + args[i].ind2 = ind2; + }); +} + +struct write_th_args_0 { + double *buffer; + const double *input; + size_t start, end; +}; + +void *write_th_0(void *args) { + struct write_th_args_0 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[i] = a->input[i]; + } + return NULL; +} + +void *write_th_0_avx(void *args) { + struct write_th_args_0 *a = args; + WRITE_0_AVX(a->buffer, a->input, a->start, a->end) + return NULL; +} + +FORCE_INLINE void write_multi_thread_0(double *buffer, const double *input, + size_t N, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_0, write_th_0, + { args[i].input = input; }); +} + +struct write_th_args_1 { + double *buffer; + const double *input; + size_t start, end; + const size_t *ind1; +}; + +void *write_th_1(void *args) { + struct write_th_args_1 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[a->ind1[i]] = a->input[i]; + } + return NULL; +} + +void *write_th_1_avx(void *args) { + struct write_th_args_1 *a = args; + WRITE_1_AVX(a->buffer, a->input, a->ind1, a->start, a->end) + return NULL; +} + +FORCE_INLINE +void 
write_multi_thread_1(double *buffer, const double *input, size_t N, + const size_t *ind1, size_t n_threads, bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_1, write_th_1, { + args[i].input = input; + args[i].ind1 = ind1; + }); +} + +struct write_th_args_2 { + double *buffer; + const double *input; + size_t start, end; + const size_t *ind1, *ind2; +}; + +void *write_th_2(void *args) { + struct write_th_args_2 *a = args; + for (size_t i = a->start; i < a->end; ++i) { + a->buffer[a->ind2[a->ind1[i]]] = a->input[i]; + } + return NULL; +} + +void *write_th_2_avx(void *args) { + struct write_th_args_2 *a = args; + WRITE_2_AVX(a->buffer, a->input, a->ind1, a->ind2, a->start, a->end) + return NULL; +} + +FORCE_INLINE void write_multi_thread_2(double *buffer, const double *input, + size_t N, const size_t *ind1, + const size_t *ind2, size_t n_threads, + bool use_avx) { + THREAD_TEMPLATE(N, n_threads, write_th_args_2, write_th_2, { + args[i].input = input; + args[i].ind1 = ind1; + args[i].ind2 = ind2; + }); +} diff --git a/include/shared/parse-args.h b/include/shared/parse-args.h new file mode 100644 index 0000000..3eb6864 --- /dev/null +++ b/include/shared/parse-args.h @@ -0,0 +1,167 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +/** @file parse-args.h + * @author Patrick Lavin + * @brief Provides a function to read CLI + */ + +#ifndef PARSE_ARGS_H +#define PARSE_ARGS_H + +#define WARN 0 +#define ERROR 1 + +#define STRING_SIZE 1000000 +#define MAX_PATTERN_LEN 1048576 + +#include +#include +#include + +/** @brief Supported benchmark backends + */ +enum sg_backend { + OPENCL, /**< OpenCL Backend */ + OPENMP, /**< OpenMP CPU Backend */ + CUDA, /**< CUDA Backend */ + SERIAL, /**< SERIAL Backend */ + INVALID_BACKEND /**< Used as a default backend */ +}; + +enum sg_kernel { + INVALID_KERNEL = 0, + SCATTER, + GATHER, + GS, +}; + +enum sg_op { OP_COPY, OP_ACCUM, INVALID_OP }; + +// Specifies the indexing or offset type +enum idx_type { + UNIFORM, + MS1, + LAPLACIAN, + CUSTOM, + CONFIG_FILE, + XKP, + INVALID_IDX +}; + +/* +enum state +{ + NOTRUN, + INVALID_STATE, + VALID_STATE +}; +*/ + +struct run_config { + // keep arrays at top so they are aligned + spIdx_t *pattern; + spIdx_t *pattern_gather; + spIdx_t *pattern_scatter; + size_t *deltas; + size_t *deltas_ps; + size_t *deltas_gather; + size_t *deltas_gather_ps; + size_t *deltas_scatter; + size_t *deltas_scatter_ps; + spSize_t pattern_len; + spSize_t pattern_gather_len; + spSize_t pattern_scatter_len; + ssize_t delta; + size_t deltas_len; + ssize_t delta_gather; + size_t deltas_gather_len; + ssize_t delta_scatter; + size_t deltas_scatter_len; + enum sg_kernel kernel; + enum idx_type type; + enum idx_type type_gather; + enum idx_type type_scatter; + spSize_t generic_len; + size_t wrap; + size_t nruns; + char pattern_file[STRING_SIZE]; + char generator[STRING_SIZE]; + char name[STRING_SIZE]; + size_t random_seed; + size_t omp_threads; + enum sg_op op; + size_t vector_len; + unsigned int shmem; + size_t local_work_size; + double *time_ms; + long long **papi_ctr; + int papi_counters; + int stride_kernel; + // Reorder based kernels + int ro_morton; + int ro_hilbert; + int ro_block; + uint32_t *ro_order; + uint32_t *ro_order_dev; +}; + +struct 
backend_config {
+  enum sg_backend backend;
+  enum sg_kernel kernel;
+  enum sg_op op;
+
+  char platform_string[STRING_SIZE];
+  char device_string[STRING_SIZE];
+  char kernel_file[STRING_SIZE];
+  char kernel_name[STRING_SIZE];
+};
+
+/** @brief Read command-line arguments and populate global variables.
+ * @param argc Value passed to main
+ * @param argv Value passed to main
+ */
+void parse_args(int argc, char **argv, int *nrc, struct run_config **rc);
+struct run_config *parse_runs(int argc, char **argv);
+void error(char *what, int code);
+void print_run_config(struct run_config rc);
+#endif
diff --git a/include/shared/pcg_basic.h b/include/shared/pcg_basic.h
new file mode 100644
index 0000000..6a47067
--- /dev/null
+++ b/include/shared/pcg_basic.h
@@ -0,0 +1,78 @@
+/*
+ * PCG Random Number Generation for C.
+ *
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This code is derived from the full C implementation, which is in turn
+ * derived from the canonical C++ PCG implementation. The C++ version
+ * has many additional features and is preferable if you can use C++ in
+ * your project.
+ */ + +#ifndef PCG_BASIC_H_INCLUDED +#define PCG_BASIC_H_INCLUDED 1 + +#include + +#if __cplusplus +extern "C" { +#endif + +struct pcg_state_setseq_64 { // Internals are *Private*. + uint64_t state; // RNG state. All values are possible. + uint64_t inc; // Controls which RNG sequence (stream) is + // selected. Must *always* be odd. +}; +typedef struct pcg_state_setseq_64 pcg32_random_t; + +// If you *must* statically initialize it, here's one. + +#define PCG32_INITIALIZER \ + { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. stream id) + +void pcg32_srandom(uint64_t initstate, uint64_t initseq); +void pcg32_srandom_r(pcg32_random_t *rng, uint64_t initstate, uint64_t initseq); + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random(void); +uint32_t pcg32_random_r(pcg32_random_t *rng); + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand(uint32_t bound); +uint32_t pcg32_boundedrand_r(pcg32_random_t *rng, uint32_t bound); + +#if __cplusplus +} +#endif + +#endif // PCG_BASIC_H_INCLUDED diff --git a/include/shared/request.h b/include/shared/request.h new file mode 100644 index 0000000..1deab0e --- /dev/null +++ b/include/shared/request.h @@ -0,0 +1,79 @@ +#ifndef REQUEST_H +#define REQUEST_H + +#include "uthash.h" + +#include "config.h" + +#include +#include +#include + +typedef enum { Read, Write, Quit, Kill } request_type; +typedef enum { Waiting, Ready } request_status; + +struct request { + int client; + int id; + + request_type r_type; + request_status r_status; + size_t size; + + size_t N; + + const void *input; + void *output; + + const size_t *ind1; + const size_t *ind2; + size_t nthreads; 
+ bool use_avx; + + size_t offset; + double value; + + UT_hash_handle hh; +}; + +typedef struct request_queue { + int client; + int active; + + struct request requests[REQUEST_QUEUE_SIZE]; + + struct request *head; + struct request *tail; + + size_t capacity; + size_t count; + size_t size; + + struct request *begin; + struct request *end; + + pthread_mutexattr_t attr_lock; + pthread_condattr_t attr_empty; + pthread_condattr_t attr_fill; + + pthread_mutex_t lock; + pthread_cond_t empty, fill; +} request_queue; + +void request_queue_init(request_queue *rq); +void request_queue_free(request_queue *rq); + +void request_queue_put(request_queue *rq, const struct request *item); +void request_queue_fetch(request_queue *rq, struct request *item); + +void request_queue_activate(request_queue *rq, int id); +void request_queue_deactivate(request_queue *rq); + +typedef struct request_queue_list { + struct request_queue queues[MAX_CLIENTS]; +} request_queue_list; + +void request_queue_list_init(request_queue_list *rql); +void request_queue_list_free(request_queue_list *rql); + +#endif /* REQUEST_H */ diff --git a/include/shared/sgtype.h b/include/shared/sgtype.h new file mode 100644 index 0000000..1459d19 --- /dev/null +++ b/include/shared/sgtype.h @@ -0,0 +1,80 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. 
The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +/** @file sgtype.h + * @author Patrick Lavin + * @brief A simple include file which can be edited to change the data type + */ + +#ifndef SGTYPE_H +#define SGTYPE_H +#include +#include + +#ifdef USE_OPENCL +#include "cl-helper.h" +static_assert(sizeof(cl_ulong) == sizeof(unsigned long), + "Due to size differences between cl_ulong and unsigned long, we " + "cannot compile with OpenCL support on your system"); +static_assert(sizeof(cl_double) == sizeof(double), + "Due to size differences between cl_double and double, we cannot " + "compile with OpenCL support on your system"); +static_assert(sizeof(cl_uint) == sizeof(unsigned int), + "Due to size differences between cl_uint and unsigned int, we " + "cannot compile with OpenCL support on your system"); +static_assert(sizeof(cl_float) == sizeof(float), + "Due to size differences between cl_double and double, we cannot " + "compile with OpenCL support on your system"); +#endif + +typedef double sgData_t; +#define SGD "%lf" +typedef unsigned long sgIdx_t; +typedef unsigned long spIdx_t; +#define SGI "%lu" +typedef long sgsIdx_t; +#define SGS "%ld" + +typedef size_t spSize_t; +#define SPS "%zu" + +#endif // endif SGTYPE diff --git a/include/shared/sp_alloc.h b/include/shared/sp_alloc.h new file mode 100644 index 0000000..4af6090 --- /dev/null +++ b/include/shared/sp_alloc.h @@ -0,0 +1,57 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. 
The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/
+
+#ifndef SP_ALLOC_H
+#define SP_ALLOC_H
+
+#include <stddef.h>
+
+#ifndef SP_MAX_ALLOC
+// 65GB
+#define SP_MAX_ALLOC (65ll * 1000 * 1000 * 1000)
+#endif
+#define ALIGN_CACHE 64
+#define ALIGN_PAGE 4096
+void *sp_malloc(size_t size, size_t count, size_t align);
+void *sp_calloc(size_t size, size_t count, size_t align);
+long long get_mem_used();
+#endif
diff --git a/include/shared/utils.h b/include/shared/utils.h
new file mode 100644
index 0000000..cbdf085
--- /dev/null
+++ b/include/shared/utils.h
@@ -0,0 +1,7 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+void setup();
+void scoria_error(const char *msg);
+
+#endif /* UTILS_H */
diff --git a/include/uthash/uthash.h b/include/uthash/uthash.h
new file mode 100644
index 0000000..83ead39
--- /dev/null
+++ b/include/uthash/uthash.h
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (c) 2003-2022, Troy D. Hanson https://troydhanson.github.io/uthash/
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * */ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.3.0 + +#include /* ptrdiff_t */ +#include /* exit */ +#include /* memcmp, memset, strlen */ + +#if defined(HASH_DEFINE_OWN_STDINT) && HASH_DEFINE_OWN_STDINT +/* This codepath is provided for backward compatibility, but I plan to remove + * it. */ +#warning \ + "HASH_DEFINE_OWN_STDINT is deprecated; please use HASH_NO_STDINT instead" +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#elif defined(HASH_NO_STDINT) && HASH_NO_STDINT +#else +#include /* uint8_t, uint32_t */ +#endif + +/* These macros use decltype or the earlier __typeof GNU extension. + * As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + * when compiling c++ source) this code uses whatever method is needed + * or, for VS2008 where neither is available, uses casting workarounds. + */ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__MCST__) /* Elbrus C Compiler */ +#define DECLTYPE(x) (__typeof(x)) +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || \ + defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst, src) \ + do { \ + char **_da_dst = (char **)(&(dst)); \ + *_da_dst = (char *)(src); \ + } while (0) +#else +#define DECLTYPE_ASSIGN(dst, src) \ + do { \ + (dst) = DECLTYPE(dst)(src); \ + } while (0) +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr, sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a, n) memset(a, '\0', n) +#endif +#ifndef uthash_strlen 
+#define uthash_strlen(s) strlen(s) +#endif + +#ifndef HASH_FUNCTION +#define HASH_FUNCTION(keyptr, keylen, hashv) HASH_JEN(keyptr, keylen, hashv) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a, b, n) memcmp(a, b, n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) \ + do { \ + } while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) \ + do { \ + (oomed) = 1; \ + } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result in lost memory, hash tables are unusable */ + +#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 \ + 5U /* lg2 of initial number of buckets \ + */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhp */ +#define ELMT_FROM_HH(tbl, hhp) ((void *)(((char *)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl, elp) \ + ((UT_hash_handle *)(void *)(((char *)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ + do { \ + struct UT_hash_handle *_hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ + } while (0) + 
+#define HASH_VALUE(keyptr, keylen, hashv) \ + do { \ + HASH_FUNCTION(keyptr, keylen, hashv); \ + } while (0) + +#define HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, hashval, out) \ + do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval) != 0) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[_hf_bkt], \ + keyptr, keylen, hashval, out); \ + } \ + } \ + } while (0) + +#define HASH_FIND(hh, head, keyptr, keylen, out) \ + do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ + } while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN \ + (HASH_BLOOM_BITLEN / 8UL) + (((HASH_BLOOM_BITLEN % 8UL) != 0UL) ? 1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl, oomed) \ + do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t *)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ + } while (0) + +#define HASH_BLOOM_FREE(tbl) \ + do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + } while (0) + +#define HASH_BLOOM_BITSET(bv, idx) (bv[(idx) / 8U] |= (1U << ((idx) % 8U))) +#define HASH_BLOOM_BITTEST(bv, idx) (bv[(idx) / 8U] & (1U << ((idx) % 8U))) + +#define HASH_BLOOM_ADD(tbl, hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, \ + ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl, hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, \ + ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl, oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl, hashv) +#define HASH_BLOOM_TEST(tbl, hashv) (1) +#define 
HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh, head, oomed) \ + do { \ + (head)->hh.tbl = (UT_hash_table *)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char *)(&(head)->hh) - (char *)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket *)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } else { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * \ + sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + }) \ + } \ + } \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, \ + hashval, add, replaced, cmpfcn) \ + do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, \ + replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), \ + keylen_in, hashval, add, cmpfcn); \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add, \ + replaced) \ + do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, \ + replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, 
\ + hashval, add); \ + } while (0) + +#define HASH_REPLACE(hh, head, fieldname, keylen_in, add, replaced) \ + do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, \ + replaced); \ + } while (0) + +#define HASH_REPLACE_INORDER(hh, head, fieldname, keylen_in, add, replaced, \ + cmpfcn) \ + do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, \ + _hr_hashv, add, replaced, cmpfcn); \ + } while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ + do { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + (head)->hh.tbl->tail = &((add)->hh); \ + } while (0) + +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do { \ + do { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) { \ + break; \ + } \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do { \ + char *_hs_saved_head = (char *)(head); \ + do { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do { \ + if (!(oomed)) { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, \ + oomed); \ + if (oomed) { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + 
uthash_nonfatal_oom(add); \ + } else { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } else { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ + } while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } while (0) + +#endif + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, \ + hashval, add, cmpfcn) \ + do { \ + IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char *)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } else { \ + void *_hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } else { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } else { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ + } while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh, head, keyptr, keylen_in, add, cmpfcn) \ + do { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, \ + _hs_hashv, 
add, cmpfcn); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, hashval, \ + add, cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), \ + keylen_in, hashval, add, cmpfcn) + +#define HASH_ADD_INORDER(hh, head, fieldname, keylen_in, add, cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, hashval, add) \ + do { \ + IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (const void *)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } else { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE"); \ + } while (0) + +#define HASH_ADD_KEYPTR(hh, head, keyptr, keylen_in, add) \ + do { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, \ + hashval, add) + +#define HASH_ADD(hh, head, fieldname, keylen_in, add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv, num_bkts, bkt) \ + do { \ + bkt = ((hashv) & ((num_bkts)-1U)); \ + } while (0) + +/* delete "delptr" from the hash table. + * * "the usual" patch-up process for the app-order doubly-linked-list. + * * The use of _hd_hh_del below deserves special explanation. 
+ * * These used to be expressed using (delptr) but that led to a bug + * * if someone used the same symbol for the head and deletee, like + * * HASH_DELETE(hh,users,users); + * * We want that to work, but by changing the head (users) below + * * we were forfeiting our ability to further refer to the deletee + * (users) + * * in the patch-up process. Solution: use scratch space to + * * copy the deletee pointer, then the latter references are via that + * * scratch pointer rather than through the repointed (users) symbol. + * */ +#define HASH_DELETE(hh, head, delptr) HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh, head, delptrhh) \ + do { \ + struct UT_hash_handle *_hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, (head)->hh.tbl->num_buckets * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } else { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = \ + _hd_hh_del->next; \ + } else { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = \ + _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ + } while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head, findstr, out) \ + do { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ + } while (0) +#define HASH_ADD_STR(head, 
strfield, add) \ + do { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ + } while (0) +#define HASH_REPLACE_STR(head, strfield, add, replaced) \ + do { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ + } while (0) +#define HASH_FIND_INT(head, findint, out) \ + HASH_FIND(hh, head, findint, sizeof(int), out) +#define HASH_ADD_INT(head, intfield, add) \ + HASH_ADD(hh, head, intfield, sizeof(int), add) +#define HASH_REPLACE_INT(head, intfield, add, replaced) \ + HASH_REPLACE(hh, head, intfield, sizeof(int), add, replaced) +#define HASH_FIND_PTR(head, findptr, out) \ + HASH_FIND(hh, head, findptr, sizeof(void *), out) +#define HASH_ADD_PTR(head, ptrfield, add) \ + HASH_ADD(hh, head, ptrfield, sizeof(void *), add) +#define HASH_REPLACE_PTR(head, ptrfield, add, replaced) \ + HASH_REPLACE(hh, head, ptrfield, sizeof(void *), add, replaced) +#define HASH_DEL(head, delptr) HASH_DELETE(hh, head, delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is + * defined. + * * This is for uthash developer only; it compiles away if HASH_DEBUG isn't + * defined. + * */ +#ifdef HASH_DEBUG +#include /* fprintf, stderr */ +#define HASH_OOPS(...) 
\ + do { \ + fprintf(stderr, __VA_ARGS__); \ + exit(-1); \ + } while (0) +#define HASH_FSCK(hh, head, where) \ + do { \ + struct UT_hash_handle *_thh; \ + if (head) { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char *_prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char *)(_thh->hh_prev)) { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", (where), \ + (void *)_thh->hh_prev, (void *)_prev); \ + } \ + _bkt_count++; \ + _prev = (char *)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", (where), \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev != (char *)_thh->prev) { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", (where), \ + (void *)_thh->prev, (void *)_prev); \ + } \ + _prev = (char *)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = (_thh->next ? HH_FROM_ELMT((head)->hh.tbl, _thh->next) : NULL); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid app item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + } \ + } while (0) +#else +#define HASH_FSCK(hh, head, where) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * * the descriptor to which this macro is defined for tuning the hash + * function. + * * The app can #include to get the prototype for write(2). 
*/ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) \ + do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, (unsigned long)fieldlen); \ + } while (0) +#else +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Note (x<<5+x)=x*33. + */ +#define HASH_BER(key, keylen, hashv) \ + do { \ + unsigned _hb_keylen = (unsigned)keylen; \ + const unsigned char *_hb_key = (const unsigned char *)(key); \ + (hashv) = 0; \ + while (_hb_keylen-- != 0U) { \ + (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; \ + } \ + } while (0) + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx + * * (archive link: https://archive.is/Ivcan ) + * */ +#define HASH_SAX(key, keylen, hashv) \ + do { \ + unsigned _sx_i; \ + const unsigned char *_hs_key = (const unsigned char *)(key); \ + hashv = 0; \ + for (_sx_i = 0; _sx_i < keylen; _sx_i++) { \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + } \ + } while (0) +/* FNV-1a variation */ +#define HASH_FNV(key, keylen, hashv) \ + do { \ + unsigned _fn_i; \ + const unsigned char *_hf_key = (const unsigned char *)(key); \ + (hashv) = 2166136261U; \ + for (_fn_i = 0; _fn_i < keylen; _fn_i++) { \ + hashv = hashv ^ _hf_key[_fn_i]; \ + hashv = hashv * 16777619U; \ + } \ + } while (0) + +#define HASH_OAT(key, keylen, hashv) \ + do { \ + unsigned _ho_i; \ + const unsigned char *_ho_key = (const unsigned char *)(key); \ + hashv = 0; \ + for (_ho_i = 0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + } while (0) + +#define HASH_JEN_MIX(a, b, c) \ + do { \ + a -= b; \ + a -= c; \ + a ^= (c >> 13); \ + b -= c; \ + b -= a; \ + b ^= (a << 8); \ + c 
-= a; \ + c -= b; \ + c ^= (b >> 13); \ + a -= b; \ + a -= c; \ + a ^= (c >> 12); \ + b -= c; \ + b -= a; \ + b ^= (a << 16); \ + c -= a; \ + c -= b; \ + c ^= (b >> 5); \ + a -= b; \ + a -= c; \ + a ^= (c >> 3); \ + b -= c; \ + b -= a; \ + b ^= (a << 10); \ + c -= a; \ + c -= b; \ + c ^= (b >> 15); \ + } while (0) + +#define HASH_JEN(key, keylen, hashv) \ + do { \ + unsigned _hj_i, _hj_j, _hj_k; \ + unsigned const char *_hj_key = (unsigned const char *)(key); \ + hashv = 0xfeedbeefu; \ + _hj_i = _hj_j = 0x9e3779b9u; \ + _hj_k = (unsigned)(keylen); \ + while (_hj_k >= 12U) { \ + _hj_i += (_hj_key[0] + ((unsigned)_hj_key[1] << 8) + \ + ((unsigned)_hj_key[2] << 16) + ((unsigned)_hj_key[3] << 24)); \ + _hj_j += (_hj_key[4] + ((unsigned)_hj_key[5] << 8) + \ + ((unsigned)_hj_key[6] << 16) + ((unsigned)_hj_key[7] << 24)); \ + hashv += \ + (_hj_key[8] + ((unsigned)_hj_key[9] << 8) + \ + ((unsigned)_hj_key[10] << 16) + ((unsigned)_hj_key[11] << 24)); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12U; \ + } \ + hashv += (unsigned)(keylen); \ + switch (_hj_k) { \ + case 11: \ + hashv += ((unsigned)_hj_key[10] << 24); /* FALLTHROUGH */ \ + case 10: \ + hashv += ((unsigned)_hj_key[9] << 16); /* FALLTHROUGH */ \ + case 9: \ + hashv += ((unsigned)_hj_key[8] << 8); /* FALLTHROUGH */ \ + case 8: \ + _hj_j += ((unsigned)_hj_key[7] << 24); /* FALLTHROUGH */ \ + case 7: \ + _hj_j += ((unsigned)_hj_key[6] << 16); /* FALLTHROUGH */ \ + case 6: \ + _hj_j += ((unsigned)_hj_key[5] << 8); /* FALLTHROUGH */ \ + case 5: \ + _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: \ + _hj_i += ((unsigned)_hj_key[3] << 24); /* FALLTHROUGH */ \ + case 3: \ + _hj_i += ((unsigned)_hj_key[2] << 16); /* FALLTHROUGH */ \ + case 2: \ + _hj_i += ((unsigned)_hj_key[1] << 8); /* FALLTHROUGH */ \ + case 1: \ + _hj_i += _hj_key[0]; /* FALLTHROUGH */ \ + default:; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + } while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits 
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) || \ + defined(_MSC_VER) || defined(__BORLANDC__) || defined(__TURBOC__) +#define get16bits(d) (*((const uint16_t *)(d))) +#endif + +#if !defined(get16bits) +#define get16bits(d) \ + ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) + \ + (uint32_t)(((const uint8_t *)(d))[0])) +#endif +#define HASH_SFH(key, keylen, hashv) \ + do { \ + unsigned const char *_sfh_key = (unsigned const char *)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (; _sfh_len > 0U; _sfh_len--) { \ + hashv += get16bits(_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits(_sfh_key + 2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U * sizeof(uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof(uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: \ + hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + break; \ + default:; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + } while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl, hh, head, keyptr, keylen_in, hashval, out) \ + do { \ + if ((head).hh_head != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } else { \ + (out) = NULL; \ + } \ + while ((out) != NULL) { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) { 
\ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } else { \ + (out) = NULL; \ + } \ + } \ + } while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head, hh, addhh, oomed) \ + do { \ + UT_hash_bucket *_ha_head = &(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= \ + ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) && \ + !(addhh)->tbl->noexpand) { \ + HASH_EXPAND_BUCKETS(addhh, (addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { HASH_DEL_IN_BKT(head, addhh); }) \ + } \ + } while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head, delhh) \ + do { \ + UT_hash_bucket *_hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ + } while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * * and redistributing the items into the new buckets. Ideally the + * * items will distribute more or less evenly into the new buckets + * * (the extent to which this is true is a measure of the quality of + * * the hash function as it applies to the key domain). + * * + * * With the items distributed into more buckets, the chain length + * * (item count) in each bucket is reduced. Thus by expanding buckets + * * the hash keeps a bound on the chain length. This bounded chain + * * length is the essence of how a hash provides constant time lookup. + * * + * * The calculation of tbl->ideal_chain_maxlen below deserves some + * * explanation. 
First, keep in mind that we're calculating the + * ideal + * * maximum chain length based on the *new* (doubled) bucket + * count. + * * In fractions this is just n/b (n=number of items,b=new num + * buckets). + * * Since the ideal chain length is an integer, we want to + * calculate + * * ceil(n/b). We don't depend on floating point arithmetic in + * this + * * hash, so to calculate ceil(n/b) with integers we could + * write + * * + * * ceil(n/b) = (n/b) + ((n%b)?1:0) + * * + * * and in fact a previous version of this hash did just + * that. + * * But now we have improved things a bit by recognizing + * that b is + * * always a power of two. We keep its base 2 log handy + * (call it lb), + * * so now we can write this with a bit shift and + * logical AND: + * * + * * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * * + * */ +#define HASH_EXPAND_BUCKETS(hh, tbl, oomed) \ + do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket *)uthash_malloc( \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + if (!_he_new_buckets) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero(_he_new_buckets, \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets + 1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets * 2U) - 1U)) != 0U) \ + ? 
1U \ + : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) { \ + _he_thh = (tbl)->buckets[_he_bkt_i].hh_head; \ + while (_he_thh != NULL) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > \ + _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, \ + (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) \ + ? ((tbl)->ineff_expands + 1U) \ + : 0U; \ + if ((tbl)->ineff_expands > 1U) { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ + } while (0) + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * * HASH_SRT was added to allow the hash handle name to be passed in. 
*/ +#define HASH_SORT(head, cmpfcn) HASH_SRT(hh, head, cmpfcn) +#define HASH_SRT(hh, head, cmpfcn) \ + do { \ + unsigned _hs_i; \ + unsigned _hs_looping, _hs_nmerges, _hs_insize, _hs_psize, _hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + if (_hs_q == NULL) { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || \ + ((_hs_qsize != 0U) && (_hs_q != NULL))) { \ + if (_hs_psize == 0U) { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } else if ((_hs_qsize == 0U) || (_hs_q == NULL)) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } else if ((cmpfcn(DECLTYPE(head)( \ + ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, \ + _hs_q)))) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = \ + ((_hs_e != NULL) ? ELMT_FROM_HH((head)->hh.tbl, _hs_e) \ + : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) { \ + _hs_e->prev = \ + ((_hs_tail != NULL) ? 
ELMT_FROM_HH((head)->hh.tbl, _hs_tail) \ + : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ + } while (0) + +/* This function selects items from one hash into another hash. + * * The end result is that the selected items have dual presence + * * in both hashes. There is no copy of the items made; rather + * * they are added into the new hash through a secondary hash + * * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ + do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh = NULL; \ + ptrdiff_t _dst_hho = ((char *)(&(dst)->hh_dst) - (char *)(dst)); \ + if ((src) != NULL) { \ + for (_src_bkt = 0; _src_bkt < (src)->hh_src.tbl->num_buckets; \ + _src_bkt++) { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh != NULL; _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + IF_HASH_NONFATAL_OOM(int _hs_oomed = 0;) \ + _dst_hh = (UT_hash_handle *)(void *)(((char *)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ + uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + }) \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, \ + _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + }) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ + } while (0) + +#define HASH_CLEAR(hh, head) \ + do { \ + if ((head) != NULL) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, (head)->hh.tbl->num_buckets * \ + sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ + } while (0) + +#define HASH_OVERHEAD(hh, head) \ + (((head) != NULL) \ + ? ((size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + (HASH_BLOOM_BYTELEN))) \ + : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), \ + ((*(char **)(&(tmp))) = \ + (char *)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), ((*(char **)(&(tmp))) = \ + (char *)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#else +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), \ + ((tmp) = DECLTYPE(el)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), \ + ((tmp) = DECLTYPE(el)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh, head) +#define HASH_CNT(hh, head) ((head != NULL) ? ((head)->hh.tbl->num_items) : 0U) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. 
In this situation, the max chain length + * * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. + * (If + * * the bucket's chain exceeds this length, bucket expansion is + * triggered). + * * However, setting expand_mult to a non-zero value delays + * bucket expansion + * * (that would be triggered by additions to this particular + * bucket) + * * until its chain length reaches a *multiple* of + * HASH_BKT_CAPACITY_THRESH. + * * (The multiplier is simply expand_mult+1). The + * whole idea of this + * * multiplier is to reduce bucket expansions, + * since they are expensive, in + * * situations where we know that a + * particular bucket tends to be overused. + * * It is better to let its chain length + * grow to a longer yet-still-bounded + * * value, than to do an O(n) bucket + * expansion too often. + * */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * * more than ceil(#items/#buckets) items. that's the ideal chain length. + */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * * exceeds the ideal chain maxlen. these items pay the penalty for an + * uneven + * * hash distribution; reaching them in a chain traversal takes + * >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * * afterward, more than half the items in the hash had nonideal chain + * * positions. 
If this happens on two consecutive expansions we + * inhibit any + * * further expansion, as it's not helping; this happens when the + * hash + * * function isn't a good fit for the key domain. When + * expansion is inhibited + * * the hash will still work, albeit no longer in + * constant time. */ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + uint8_t bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + const void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..a6343a9 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(shm_malloc) +add_subdirectory(posix) +add_subdirectory(controller) +add_subdirectory(client) +add_subdirectory(shared) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} PARENT_SCOPE) +set(CLIENT_SOURCE_FILES ${CLIENT_SOURCE_FILES} PARENT_SCOPE) +set(CONTROLLER_SOURCE_FILES ${CONTROLLER_SOURCE_FILES} PARENT_SCOPE) + +add_compile_options(-Wall -Wextra -pedantic -Werror) diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt new file mode 100644 index 0000000..6b10a84 --- /dev/null +++ b/src/client/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(CLIENT_SOURCE_FILES ${CLIENT_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/client_cleanup.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_init.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_memory.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_place_requests.c + ${CMAKE_CURRENT_SOURCE_DIR}/client_read_location.c + PARENT_SCOPE) diff --git a/src/client/client_cleanup.c b/src/client/client_cleanup.c new file mode 100644 index 0000000..5bd8f85 --- /dev/null +++ b/src/client/client_cleanup.c @@ -0,0 +1,29 @@ +#include "client_cleanup.h" +#include "posix_sm.h" +#include "request.h" + +#include "shm_malloc.h" + +#include + +void cleanup_queues(struct client *client) { + request_queue_deactivate(client->shared_requests); + request_queue_deactivate(client->shared_completions); + + client->shared_requests = NULL; + client->shared_completions = NULL; +} + +void cleanup_shared_mem(struct client *client) { + scoria_sm_unmap(client->shared_location, sizeof(struct memory_location), + "client:unmap"); + + close(client->fd_location); + close(client->fd_requests); + close(client->fd_completions); +} + +void cleanup(struct client *client) { + cleanup_queues(client); + cleanup_shared_mem(client); +} diff --git a/src/client/client_init.c b/src/client/client_init.c new file mode 100644 index 0000000..a18a21e --- /dev/null +++ b/src/client/client_init.c @@ -0,0 +1,135 @@ +#include "client_init.h" +#include "client_read_location.h" + +#include "client.h" +#include "config.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void init_memory_pool(struct client *client) { + if (shm_init(SHARED_MEMORY_NAME, setup) < 0) + scoria_error("Client:shm_init"); + + client->shared_mem_ptr = shm_global(); + + if (client->chatty) { + printf("Client: Mapped Shared Memory Address: %p %p\n", + (void *)client->shared_mem_ptr, + (void 
*)client->shared_location->shared_mem_ptr); + + if (client->shared_mem_ptr == client->shared_location->shared_mem_ptr) + printf("Client: Successfully Mapped Shared Memory Address\n"); + else + // TODO: Error handling + scoria_error("Client: Mapped Shared Memory to Incorrect Address\n"); + } +} + +void init_requests(struct client *client) { + client->fd_requests = + scoria_sm_open(SHARED_REQUESTS_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_requests_list = + scoria_sm_map(client->shared_location->shared_requests_list, + sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, client->fd_requests, 0, "client:mmap"); + + if (client->chatty) { + if (client->shared_requests_list == + client->shared_location->shared_requests_list) + printf("Client: Successfully Mapped Shared Request Queue List to " + "Address: %p %p\n", + (void *)client->shared_requests_list, + (void *)client->shared_location->shared_requests_list); + else + // TODO: Error handling + scoria_error( + "Client: Mapped Shared Request Queue List to Incorrect Address\n"); + } +} + +void init_completions(struct client *client) { + client->fd_completions = + scoria_sm_open(SHARED_COMPLETIONS_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_completions_list = + scoria_sm_map(client->shared_location->shared_completions_list, + sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, client->fd_completions, 0, "client:mmap"); + + if (client->chatty) { + if (client->shared_completions_list == + client->shared_location->shared_completions_list) + printf("Client: Successfully Mapped Shared Completion Queue List to " + "Address: %p " + "%p\n", + (void *)client->shared_completions_list, + (void *)client->shared_location->shared_completions_list); + else + // TODO: Error handling + scoria_error("Client: Mapped Shared Completiong Queue List to Incorrect " + "Address\n"); + } +} + +void init_virtual_address_mailbox(struct client *client) { + client->fd_location = + 
scoria_sm_open(SHARED_LOCATION_NAME, O_RDWR, 0, "client:shm_open"); + client->shared_location = scoria_sm_map( + NULL, sizeof(struct memory_location), PROT_READ | PROT_WRITE, MAP_SHARED, + client->fd_location, 0, "client:mmap"); +} + +void init_id(struct client *client) { + int id = -1; + + for (int i = 0; i < MAX_CLIENTS; i++) { + if (client->shared_requests_list->queues[i].active == 0) { + assert(client->shared_requests_list->queues[i].client == -1); + + assert(client->shared_completions_list->queues[i].active == 0); + assert(client->shared_completions_list->queues[i].client == -1); + + // TODO: Thread Safe + request_queue_activate(&(client->shared_requests_list->queues[i]), i); + request_queue_activate(&(client->shared_completions_list->queues[i]), i); + + client->shared_requests = &(client->shared_requests_list->queues[i]); + client->shared_completions = + &(client->shared_completions_list->queues[i]); + + id = i; + break; + } + } + + if (id == -1) + scoria_error("Client: Exceeded Maxmimum Number of Clients\n"); + + client->id = id; + + if (client->chatty) + printf("Client: Assigned ID %d\n", client->id); +} + +void init(struct client *client) { + init_virtual_address_mailbox(client); + + read_location(client); + + init_memory_pool(client); + init_requests(client); + init_completions(client); + + init_id(client); + + client->unmatched_requests = NULL; + + printf("Client(%d): Connected to Controller Successfully\n", client->id); +} diff --git a/src/client/client_memory.c b/src/client/client_memory.c new file mode 100644 index 0000000..b55a346 --- /dev/null +++ b/src/client/client_memory.c @@ -0,0 +1,84 @@ +#include "client_memory.h" + +#include "client.h" +#include "config.h" +#include "request.h" + +#include +#include + +static int rid = 0; + +void scoria_put_request(struct client *client, struct request *req) { + request_queue_put(client->shared_requests, req); + + if (client->chatty) + printf("Client(%d): Added Request %d:%d to Request Queue %d\n", 
client->id, + req->client, req->id, client->id); +} + +void scoria_quit(struct client *client, struct request *req) { + if (client->chatty) + printf("Client(%d): Quit Request\n", client->id); + + req->client = client->id; + req->r_type = Quit; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Created Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} + +void scoria_read(struct client *client, void *buffer, const size_t N, + void *output, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req) { + if (client->chatty) + printf("Client(%d): Reading Buffer\n", client->id); + + req->client = client->id; + req->r_type = Read; + req->input = buffer; + req->output = output; + req->N = N; + req->ind1 = ind1; + req->ind2 = ind2; + req->nthreads = num_threads; + req->use_avx = use_avx; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Create Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} + +void scoria_write(struct client *client, void *buffer, const size_t N, + void *input, const size_t *ind1, const size_t *ind2, + size_t num_threads, bool use_avx, struct request *req) { + if (client->chatty) + printf("Client(%d): Writing Buffer\n", client->id); + + req->client = client->id; + req->r_type = Write; + req->output = buffer; + req->input = input; + req->N = N; + req->ind1 = ind1; + req->ind2 = ind2; + req->nthreads = num_threads; + req->use_avx = use_avx; + req->id = rid; + rid++; + + if (client->chatty) + printf("Client(%d): Created Request Object: Client: %d ID: %d Type: %d\n", + client->id, req->client, req->id, req->r_type); + + scoria_put_request(client, req); +} diff --git a/src/client/client_place_requests.c b/src/client/client_place_requests.c new file mode 100644 index 0000000..8de3ddd --- /dev/null +++ 
b/src/client/client_place_requests.c @@ -0,0 +1,107 @@ +#include "client_place_requests.h" +#include "client_memory.h" + +#include "client.h" +#include "config.h" +#include "request.h" + +#include "shm_malloc.h" +#include "uthash.h" + +#include + +void wait_request(struct client *client, struct request *req) { + if (client->chatty) + printf("Client(%d): Waiting on Request %d:%d\n", client->id, req->client, + req->id); + + int found; + int id = req->id; + + struct request *query; + + HASH_FIND_INT(client->unmatched_requests, &id, query); + if (query == NULL) + found = 0; + else + found = 1; + + while (!found) { + struct request complete; + request_queue_fetch(client->shared_completions, &complete); + + if (complete.r_type == Kill) { + printf("Received a Kill Request Originating from a Quit Request from " + "Client(%d)\n", + complete.client); + exit(1); + } + + if (complete.id == id) { + *req = complete; + struct request *find; + + HASH_FIND_INT(client->unmatched_requests, &id, find); + if (find != NULL) + HASH_DEL(client->unmatched_requests, find); + + found = 1; + } else + HASH_ADD_INT(client->unmatched_requests, id, &complete); + } + + if (client->chatty) + printf("Client(%d): Controller Completed Request %d:%d\n", client->id, + req->client, id); +} + +void wait_requests(struct client *client, struct request *reqs, + size_t num_reqs) { + for (size_t i = 0; i < num_reqs; ++i) + wait_request(client, &reqs[i]); +} + +void place_requests(struct client *client) { + // Allocate Buffer + double *A = shm_malloc(1024 * sizeof(int)); + + if (client->chatty) + printf("Client(%d): Received Pointer to Allocated Memory: %p\n", client->id, + (void *)A); + + // Write to Buffer + printf("Client(%d): Writing Array:\n", client->id); + + double *input = shm_malloc(1024 * sizeof(double)); + for (size_t i = 0; i < 1024; ++i) { + input[i] = (double)(2 * i); + } + + struct request req1; + scoria_write(client, A, 1024, input, NULL, NULL, 0, 0, &req1); + wait_request(client, &req1); + 
shm_free(input); + + // Read from Buffer + printf("Client(%d): Reading Array:\n", client->id); + + double *output = shm_malloc(1024 * sizeof(double)); + + struct request req2; + scoria_read(client, A, 1024, output, NULL, NULL, 0, 0, &req2); + wait_request(client, &req2); + + for (size_t i = 0; i < 1024; ++i) + printf("%.2f ", output[i]); + printf("\n"); + shm_free(output); + + // Free Buffer + shm_free(A); + + // Exit Program + // struct request req3; + // scoria_quit(client, &req3); + + // wait_request(client, &req3); +} diff --git a/src/client/client_read_location.c b/src/client/client_read_location.c new file mode 100644 index 0000000..444eb69 --- /dev/null +++ b/src/client/client_read_location.c @@ -0,0 +1,33 @@ +#include "client_read_location.h" + +#include "client.h" +#include "config.h" + +#include + +void read_location(struct client *client) { + if (client->chatty) + printf("Client: Waiting on Controller\n"); + + while (!client->shared_location->ready) { + ; + ; + } + + client->shared_mem_ptr = client->shared_location->shared_mem_ptr; + client->shared_requests_list = client->shared_location->shared_requests_list; + client->shared_completions_list = + client->shared_location->shared_completions_list; + + if (client->chatty) { + printf("Client: Received Shared Memory Addresses\n"); + printf("Client: shared_mem_ptr %p %p\n", (void *)client->shared_mem_ptr, + (void *)client->shared_location->shared_mem_ptr); + printf("Client: shared_requests_list %p %p\n", + (void *)client->shared_requests_list, + (void *)client->shared_location->shared_requests_list); + printf("Client: shared_completions_list %p %p\n", + (void *)client->shared_completions_list, + (void *)client->shared_location->shared_completions_list); + } +} diff --git a/src/controller/CMakeLists.txt b/src/controller/CMakeLists.txt new file mode 100644 index 0000000..e1c3a99 --- /dev/null +++ b/src/controller/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator 
C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(CONTROLLER_SOURCE_FILES ${CONTROLLER_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/controller_cleanup.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_handle_requests.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_init.c + ${CMAKE_CURRENT_SOURCE_DIR}/controller_write_location.c + PARENT_SCOPE) diff --git a/src/controller/controller_cleanup.c b/src/controller/controller_cleanup.c new file mode 100644 index 0000000..5dd0cfd --- /dev/null +++ b/src/controller/controller_cleanup.c @@ -0,0 +1,40 @@ +#include "controller_cleanup.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include + +void cleanup_shared_mem(struct controller *controller) { + request_queue_list_free(controller->shared_requests_list); + request_queue_list_free(controller->shared_completions_list); + + scoria_sm_unmap(controller->shared_location, sizeof(struct memory_location), + "controller:unmap:shared_location"); + scoria_sm_unlink(SHARED_LOCATION_NAME, + "controller:sem_unlink:shared_location"); + + scoria_sm_unmap(controller->shared_requests_list, + sizeof(struct request_queue_list), + "controller:unmap:shared_requests"); + scoria_sm_unlink(SHARED_REQUESTS_NAME, + "controller:sem_unlink:shared_requests"); + + scoria_sm_unmap(controller->shared_completions_list, + sizeof(struct request_queue_list), + "controller:unmap:shared_completions"); + scoria_sm_unlink(SHARED_COMPLETIONS_NAME, + "controller:sem_unlink:shared_completions"); + + close(controller->fd_location); + close(controller->fd_requests); + close(controller->fd_completions); +} + +void cleanup(struct controller *controller) { + cleanup_shared_mem(controller); + + shm_destroy(); +} diff --git a/src/controller/controller_handle_requests.c b/src/controller/controller_handle_requests.c new file mode 100644 index 0000000..fba3693 --- /dev/null +++ b/src/controller/controller_handle_requests.c @@ -0,0 
+1,232 @@ +#include "controller_handle_requests.h" + +#include "config.h" +#include "controller.h" +#include "kernels.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include + +int quit = 0; +int tid = -1; + +void handle_read(struct controller *controller, struct request_queue *queue, + struct request *req) { + if (controller->chatty) + printf("Controller: Received Request Object: Client: %d ID: %d Type: %d N: " + "%ld\n", + req->client, req->id, req->r_type, req->N); + + if (req->ind1 == NULL) { + assert(req->ind2 == NULL); + if (req->nthreads == 0) { + read_single_thread_0(req->output, req->input, req->N, req->use_avx); + } else { + read_multi_thread_0(req->output, req->input, req->N, req->nthreads, + req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); + + return; + } + + if (req->ind2 == NULL) { + assert(req->ind1 != NULL); + if (req->nthreads == 0) { + read_single_thread_1(req->output, req->input, req->N, req->ind1, + req->use_avx); + } else { + read_multi_thread_1(req->output, req->input, req->N, req->ind1, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); + + return; + } + + assert(req->ind1 != NULL); + assert(req->ind2 != NULL); + + if (req->nthreads == 0) { + read_single_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->use_avx); + } else { + read_multi_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Read Data with N: %ld\n", req->client, + req->N); +} + +void handle_write(struct controller *controller, struct request_queue *queue, + struct 
request *req) { + if (controller->chatty) + printf("Controller: Received Request Object: Client: %d ID: %d Type: %d " + "Pointer: %p Input Pointer: %p N: %ld\n", + req->client, req->id, req->r_type, (void *)req->output, + (void *)req->input, req->N); + + if (req->ind1 == NULL) { + assert(req->ind2 == NULL); + if (req->nthreads == 0) { + write_single_thread_0(req->output, req->input, req->N, req->use_avx); + } else { + write_multi_thread_0(req->output, req->input, req->N, req->nthreads, + req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); + + return; + } + + if (req->ind2 == NULL) { + assert(req->ind1 != NULL); + if (req->nthreads == 0) { + write_single_thread_1(req->output, req->input, req->N, req->ind1, + req->use_avx); + } else { + write_multi_thread_1(req->output, req->input, req->N, req->ind1, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); + + return; + } + + assert(req->ind1 != NULL); + assert(req->ind2 != NULL); + + if (req->nthreads == 0) { + write_single_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->use_avx); + } else { + write_multi_thread_2(req->output, req->input, req->N, req->ind1, req->ind2, + req->nthreads, req->use_avx); + } + + req->r_status = Ready; + + request_queue_put(queue, req); + + if (controller->chatty) + printf("Controller: Client(%d) Wrote Data with N: %ld\n", req->client, + req->N); +} + +void *handler(void *args) { + struct thread_args *a = args; + + size_t i = a->i; + struct controller *controller = a->controller; + + struct request_queue *requests = + &(controller->shared_requests_list->queues[i]); + struct request_queue *completions = + &(controller->shared_completions_list->queues[i]); + + while (!quit) { + 
struct request req; + request_queue_fetch(requests, &req); + + if (controller->chatty) + printf("Controller: Client (%ld): Request %d Detected\n", i, req.id); + + switch (req.r_type) { + case Read: + handle_read(controller, completions, &req); + break; + case Write: + handle_write(controller, completions, &req); + break; + case Quit: + tid = i; + quit = 1; + req.r_status = Ready; + request_queue_put(completions, &req); + break; + case Kill: + quit = 1; + break; + default: + printf("Controller: Client (%ld): Invalid Request Type Detected\n", i); + tid = i; + quit = 1; + } + } + + return NULL; +} + +void handle_requests(struct controller *controller) { + // Start loop + pthread_t threads[MAX_CLIENTS]; + struct thread_args args[MAX_CLIENTS]; + + for (size_t i = 0; i < MAX_CLIENTS; ++i) { + args[i].i = i; + args[i].controller = controller; + + int ret = pthread_create(&threads[i], NULL, handler, &args[i]); + assert(ret == 0); + } + + while (!quit) { + ; + ; + } + + for (int i = 0; i < MAX_CLIENTS; ++i) { + if (i != tid) { + struct request req; + req.r_type = Kill; + req.id = -1; + req.client = tid; + request_queue_put(&(controller->shared_completions_list->queues[i]), + &req); + request_queue_put(&(controller->shared_requests_list->queues[i]), &req); + } + } + + for (int i = 0; i < MAX_CLIENTS; ++i) { + pthread_join(threads[i], NULL); + } + + printf("Controller: Quit Request Received from Client(%d)\n", tid); +} diff --git a/src/controller/controller_init.c b/src/controller/controller_init.c new file mode 100644 index 0000000..eecea79 --- /dev/null +++ b/src/controller/controller_init.c @@ -0,0 +1,94 @@ +#include "controller_init.h" +#include "controller_write_location.h" + +#include "config.h" +#include "controller.h" +#include "posix_sm.h" +#include "request.h" +#include "utils.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void init_files() { + if (access(SHARED_MEMORY_NAME, F_OK) == 0) + if (unlink(SHARED_MEMORY_NAME) == -1) + 
scoria_error("controller:unlink:shared_memory"); +} + +void init_memory_pool(struct controller *controller) { + if (shm_init(SHARED_MEMORY_NAME, setup) < 0) + scoria_error("Controller:shm_init"); + + controller->shared_mem_ptr = shm_global(); +} + +void init_requests(struct controller *controller) { + controller->fd_requests = + scoria_sm_open(SHARED_REQUESTS_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_requests, sizeof(struct request_queue_list), + "controller:ftruncate"); + controller->shared_requests_list = scoria_sm_map( + NULL, sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, controller->fd_requests, 0, "controller:mmap"); + + request_queue_list_init(controller->shared_requests_list); + + if (controller->chatty) + printf("Controller: Shared Request Address: %p\n", + (void *)controller->shared_requests_list); +} + +void init_completions(struct controller *controller) { + controller->fd_completions = + scoria_sm_open(SHARED_COMPLETIONS_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_completions, + sizeof(struct request_queue_list), "controller:ftruncate"); + controller->shared_completions_list = scoria_sm_map( + NULL, sizeof(struct request_queue_list), PROT_READ | PROT_WRITE, + MAP_SHARED, controller->fd_completions, 0, "controller:mmap"); + + request_queue_list_init(controller->shared_completions_list); + + if (controller->chatty) + printf("Controller: Shared Completions Address: %p\n", + (void *)controller->shared_completions_list); +} + +void init_virtual_address_mailbox(struct controller *controller) { + controller->fd_location = + scoria_sm_open(SHARED_LOCATION_NAME, O_RDWR | O_CREAT | O_TRUNC, 0660, + "controller:shm_open"); + scoria_sm_truncate(controller->fd_location, sizeof(struct memory_location), + "controller:ftruncate"); + controller->shared_location = scoria_sm_map( + NULL, sizeof(struct memory_location), PROT_READ 
| PROT_WRITE, MAP_SHARED, + controller->fd_location, 0, "controller:mmap"); + + controller->shared_location->ready = 0; + controller->shared_location->shared_mem_ptr = NULL; + controller->shared_location->shared_requests_list = NULL; + controller->shared_location->shared_completions_list = NULL; + + if (controller->chatty) + printf("Controller: Shared Location: %p\n", + (void *)controller->shared_location); +} + +void init(struct controller *controller) { + init_files(); + + init_virtual_address_mailbox(controller); + + init_memory_pool(controller); + init_requests(controller); + init_completions(controller); + + write_location(controller); +} diff --git a/src/controller/controller_write_location.c b/src/controller/controller_write_location.c new file mode 100644 index 0000000..27f8771 --- /dev/null +++ b/src/controller/controller_write_location.c @@ -0,0 +1,29 @@ +#include "controller_write_location.h" + +#include "config.h" +#include "controller.h" + +#include + +void write_location(struct controller *controller) { + controller->shared_location->shared_mem_ptr = controller->shared_mem_ptr; + controller->shared_location->shared_requests_list = + controller->shared_requests_list; + controller->shared_location->shared_completions_list = + controller->shared_completions_list; + + controller->shared_location->ready = 1; + + if (controller->chatty) { + printf("Controller: Posted Shared Memory Addresses\n"); + printf("Controller: shared_mem_ptr %p %p\n", + (void *)controller->shared_location->shared_mem_ptr, + (void *)controller->shared_mem_ptr); + printf("Controller: shared_requests_list %p %p\n", + (void *)controller->shared_location->shared_requests_list, + (void *)controller->shared_requests_list); + printf("Controller: shared_completions_list %p %p\n", + (void *)controller->shared_location->shared_completions_list, + (void *)controller->shared_completions_list); + } +} diff --git a/src/posix/CMakeLists.txt b/src/posix/CMakeLists.txt new file mode 100644 index 
0000000..745ff76 --- /dev/null +++ b/src/posix/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/posix_sm.c + PARENT_SCOPE) diff --git a/src/posix/posix_sm.c b/src/posix/posix_sm.c new file mode 100644 index 0000000..efb1277 --- /dev/null +++ b/src/posix/posix_sm.c @@ -0,0 +1,43 @@ +#include "posix_sm.h" +#include "config.h" +#include "utils.h" + +#include +#include +#include +#include + +int scoria_sm_open(const char *name, int oflag, mode_t mode, const char *msg) { + int fd; + + if ((fd = shm_open(name, oflag, mode)) == -1) + scoria_error(msg); + + return fd; +} + +void scoria_sm_unlink(const char *name, const char *msg) { + if (shm_unlink(name) == -1) + scoria_error(msg); +} + +void scoria_sm_truncate(const int fd, const size_t length, const char *msg) { + if (ftruncate(fd, length) == -1) + scoria_error(msg); +} + +void *scoria_sm_map(void *addr, const size_t length, const int prot, + const int flags, const int fd, const off_t offset, + const char *msg) { + void *ptr; + + if ((ptr = mmap(addr, length, prot, flags, fd, offset)) == MAP_FAILED) + scoria_error(msg); + + return ptr; +} + +void scoria_sm_unmap(void *ptr, const size_t length, const char *msg) { + if (munmap(ptr, length) == -1) + scoria_error(msg); +} diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt new file mode 100644 index 0000000..a988b60 --- /dev/null +++ b/src/shared/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.10) +project(memory-accelerator C) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_compile_options(-Wall -Wextra -pedantic -Werror) + +set(SHARED_SOURCE_FILES ${SHARED_SOURCE_FILES} + ${CMAKE_CURRENT_SOURCE_DIR}/request.c + ${CMAKE_CURRENT_SOURCE_DIR}/utils.c + PARENT_SCOPE) diff --git a/src/shared/request.c 
b/src/shared/request.c new file mode 100644 index 0000000..e8d6f9b --- /dev/null +++ b/src/shared/request.c @@ -0,0 +1,130 @@ +#include "request.h" + +#include "shm_malloc.h" + +#include +#include +#include +#include + +void request_queue_init(request_queue *rq) { + rq->client = -1; + rq->active = 0; + + rq->begin = &(rq->requests[0]); + rq->end = &(rq->requests[REQUEST_QUEUE_SIZE]); + + rq->head = rq->begin; + rq->tail = rq->begin; + + rq->count = 0; + rq->size = sizeof(struct request); + + rq->capacity = REQUEST_QUEUE_SIZE; + + pthread_mutexattr_init(&(rq->attr_lock)); + pthread_mutexattr_setpshared(&(rq->attr_lock), PTHREAD_PROCESS_SHARED); + + pthread_mutex_init(&(rq->lock), &(rq->attr_lock)); + + pthread_condattr_init(&(rq->attr_empty)); + pthread_condattr_setpshared(&(rq->attr_empty), PTHREAD_PROCESS_SHARED); + + pthread_cond_init(&(rq->empty), &(rq->attr_empty)); + + pthread_condattr_init(&(rq->attr_fill)); + pthread_condattr_setpshared(&(rq->attr_fill), PTHREAD_PROCESS_SHARED); + + pthread_cond_init(&(rq->fill), &(rq->attr_fill)); +} + +void request_queue_free(request_queue *rq) { + pthread_mutex_destroy(&(rq->lock)); + pthread_mutexattr_destroy(&(rq->attr_lock)); + + pthread_cond_destroy(&(rq->empty)); + pthread_condattr_destroy(&(rq->attr_empty)); + + pthread_cond_destroy(&(rq->fill)); + pthread_condattr_destroy(&(rq->attr_fill)); +} + +void request_queue_put(request_queue *rq, const struct request *item) { + pthread_mutex_lock(&(rq->lock)); + + while (rq->count == rq->capacity) + pthread_cond_wait(&(rq->empty), &(rq->lock)); + + memcpy(rq->head, item, rq->size); + + rq->head = rq->head + 1; + + if (rq->head == rq->end) + rq->head = rq->begin; + + rq->count++; + + pthread_cond_signal(&(rq->fill)); + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_fetch(request_queue *rq, struct request *item) { + pthread_mutex_lock(&(rq->lock)); + + while (rq->count == 0) + pthread_cond_wait(&(rq->fill), &(rq->lock)); + + memcpy(item, rq->tail, rq->size); 
 + + rq->tail = rq->tail + 1; + + if (rq->tail == rq->end) + rq->tail = rq->begin; + + rq->count--; + + pthread_cond_signal(&(rq->empty)); + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_activate(request_queue *rq, int id) { + pthread_mutex_lock(&(rq->lock)); + + assert(rq->client == -1); + assert(rq->active == 0); + + rq->client = id; + rq->active = 1; + + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_deactivate(request_queue *rq) { + pthread_mutex_lock(&(rq->lock)); + + rq->client = -1; + rq->active = 0; + + rq->begin = &(rq->requests[0]); + rq->end = &(rq->requests[REQUEST_QUEUE_SIZE]); + + rq->head = rq->begin; + rq->tail = rq->begin; + + rq->count = 0; + rq->size = sizeof(struct request); + + rq->capacity = REQUEST_QUEUE_SIZE; + + pthread_mutex_unlock(&(rq->lock)); +} + +void request_queue_list_init(request_queue_list *rql) { + for (size_t i = 0; i < MAX_CLIENTS; ++i) + request_queue_init(&(rql->queues[i])); +} + +void request_queue_list_free(request_queue_list *rql) { + for (size_t i = 0; i < MAX_CLIENTS; ++i) + request_queue_free(&(rql->queues[i])); +} diff --git a/src/shared/utils.c b/src/shared/utils.c new file mode 100644 index 0000000..4eefb77 --- /dev/null +++ b/src/shared/utils.c @@ -0,0 +1,27 @@ +#include "utils.h" +#include "config.h" + +#include "shm_malloc.h" + +#include +#include +#include + +void setup() { + struct shared_memory *shared_mem_ptr = + shm_malloc(sizeof(struct shared_memory)); + + if (!shared_mem_ptr) { + // TODO: Handle Error + } + + shared_mem_ptr->head = 0; + shared_mem_ptr->tail = &shared_mem_ptr->head; + + shm_set_global(shared_mem_ptr); +} + +void scoria_error(const char *msg) { + perror(msg); + exit(1); +} diff --git a/src/shm_malloc/CMakeLists.txt b/src/shm_malloc/CMakeLists.txt new file mode 100644 index 0000000..127060f --- /dev/null +++ b/src/shm_malloc/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.10) +project(shm_malloc C) + +add_library(shm STATIC malloc.c malloc.h) 
+target_compile_definitions(shm PUBLIC SHM) +target_include_directories(shm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/shm_malloc/README b/src/shm_malloc/README new file mode 100644 index 0000000..dd0b706 --- /dev/null +++ b/src/shm_malloc/README @@ -0,0 +1,130 @@ +NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY + +THIS CODE IS RELEASED AS IS, WITH NO WARRANTY OF ANY KIND. + +NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY - NO WARRANTY + + +These malloc routines are based primarily on the paper "Efficient Kernel +Memory Allocation on Shared-Memory Multiprocessors", by McKenney and +Slingwine, which appears in the USENIX Winter 1993 conference proceedings. + +The basic idea is to reduce the number of interlocks required by using +small per-process free lists. Interlocks are only required when the +free lists are empty or overflow. + +This per-process free list idea is used only for blocks smaller than 1 +page. For these small blocks, all allocations are rounded up to a +power-of-two size and one free list is used for each size. Blocks are +allocated by splitting an entire page into blocks of the same size, so +that no extra per-block storage is required for in-use blocks. + +Larger allocations are always rounded up to a multiple of the page size. +Free pages are kept in a sorted list, and a best-fit allocation scheme +is used. + +When compiled directly, the source will produce malloc replacement code. + +When compiled with -DMALLOC_DEBUG, it will add in extra guard and consistency +checks as an aid to debugging heap corruption problems. + +When compiled with -DSHM, the source will produce a shared-memory malloc +package, which uses mmap to get memory from the system (so it may also +be used as persistent storage). There are a number of important routines. + +void *shm_malloc(size_t); +void *shm_calloc(int, int); +void *shm_realloc(void *, size_t); +void shm_free(void *); + These are used instead of the standard malloc routines. 
 + +void shm_init(char *filename [, void (*init)()]); + This must be called in each process to get access to the shared memory. + If the file doesn't (yet) exist, it will be created and initialized to + `all free'. It contains code to avoid race conditions, so you can call + it in multiple processes simultaneously and the right thing will happen + + The optional init function will be called if this is a newly created + file. Only one process will call this function if multiple processes + call shm_init on a new file simultaneously; the others will wait for it + to finish. + +void shm_fini(); + This must be called in each process before exiting (unless shm_destroy + is called). It frees up all the process internal tables and `disconnects' + from the shared memory. + +void shm_destroy(); + This works as shm_fini and also removes the shared memory file. Other + processes which are trying to use the shared memory may fail mysteriously + after this is called. + +void shm_child(); + This should be called in the child process if a process calls fork + after it calls shm_init, before the child process calls any other + shared memory routine. The child can safely call exit or exec + without calling shm_child, as long as it calls no shared memory + routines before then. + +void *shm_global(); +void shm_set_global(void *); + These two routines give all processes that have mapped a shared memory file + (with shm_init) access to a single global variable. Usually a pointer + into the shared memory. Initialized to 0 when shm_init is called for + a non-existent file. + +The code is arranged so that all processes will map the memory at the same +address. This precludes the possibility of mapping multiple shared memory +files into one process, but it's almost impossible to work without it. The +shared memory load address is a constant that will need to be changed for +any port. 
+ +You have a choice of 4 different locking schemes for the global tables: +System V semaphores, File locks, atomic test-and-set spin locks, and +pthreads mutexes. You can choose between them by compiling with one of: + -DLOCKTYPE=SYSVSEM + -DLOCKTYPE=FLOCK + -DLOCKTYPE=SPINLOCK + -DLOCKTYPE=PMUTEX +Put this into the `CFLAGS' in the Makefile, or use it on the command line +when compiling malloc.c manually. The default is posix mutexes on posix +systems, otherwise spinlocks if it is supported, otherwise file locks. +Currently spinlocks are only supported when using GCC on the following +processors: + m68k + m88k + sparc + alpha + i386 + ppc + x86_64 +To add support for an additional processor, you'll need to add appropriate +macros to `atomic.h'. + +I've tested this code on the following machines: + sparc SunOS 4 + sparc SunOS 5 + mips IRIX 5 + alpha OSF/1 + i586 FreeBSD 2.1.5 + rs6k AIX 4.2 + Linux 2.6.38 + +On FreeBSD and AIX, the code does not work if the shared memory file is on +an NFS-mounted partition. On all systems, its a bit slower to use an NFS +file, so you're better off using a local file if possible. + +Note that the fact that you can use a file on a NFS server does NOT mean +that you can use this code for distributed shared memory -- it won't +work. In general ANY TIME two different client machines try to modify +the same file on a NFS server, all bets are off and everything will +break. NFS is an abbreviation for `Not a File System.' + +On hppa HP-UX 9 machines, this code is known to crash the machine. On +i86 FreeBSD 2.0 it also frequently crashes the machine. + +I'd appreciate reciving a copy of any bug-fixes, enhancements, or ports +anyone makes with this code. 
+ +Chris Dodd +dodd@csl.sri.com diff --git a/src/shm_malloc/atomic.h b/src/shm_malloc/atomic.h new file mode 100644 index 0000000..67b91f6 --- /dev/null +++ b/src/shm_malloc/atomic.h @@ -0,0 +1,250 @@ +#ifndef __atomic_h__ +#define __atomic_h__ + +#ifndef __GNUC__ +#error This file requires GCC +#else + +#if defined(mc68000) +typedef unsigned int word_t; + +typedef unsigned int TAS_t; +#define TAS(m) \ + ({ \ + register TAS_t _t_tas; \ + asm volatile("tas (%1); smi %0" : "=g"(_t_tas) : "a"(&(m))); \ + _t_tas; \ + }) + +#if defined(mc68020) +#define CAS(m, c, u) \ + ({ \ + register word_t _o; \ + asm volatile("cas %0,%1,(%2)" : "=d"(_o) : "d"(u), "a"(&(m)), "0"(c)); \ + _o; \ + }) +#define CAS2(m1, c1, u1, m2, c2, u2) \ + asm volatile("cas2 %0:%1,%2:%3,(%4):(%5)" \ + : "=d"(c1), "=d"(c2) \ + : "d"(u1), "d"(u2), "g"(&(m1)), "g"(&(m2)), "0"(c1), "1"(c2)) +#endif /* mc68020 */ + +#elif defined(__i386__) +typedef unsigned int word_t; +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) +#define SWAPB(m, v) \ + ({ \ + register unsigned char _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) + +#elif defined(__x86_64__) +typedef unsigned long word_t; +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) +#define SWAPB(m, v) \ + ({ \ + register unsigned char _o; \ + asm volatile("xchg %0, %2" : "=r"(_o) : "0"(v), "m"(m)); \ + _o; \ + }) + +#elif defined(sparc) +typedef unsigned int word_t; + +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("swap [%2],%0" : "=r"(_o) : "0"(v), "r"(&(m))); \ + _o; \ + }) + +#define SWAPB(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("ldstub [%2],%0" : "=r"(_o) : "0"(v), "r"(&(m))); \ + _o; \ + }) + +#elif defined(m88k) +typedef unsigned int word_t; + +#define SWAP(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("xmem %0,%1,0" : "=r"(_o) 
: "r"(&(m)), "0"(v)); \ + _o; \ + }) + +#elif defined(__alpha__) +typedef unsigned long word_t; + +#define RW_NONSTRICT 1 +#define MEMORY_BARRIER asm volatile("mb") +#define LOAD_LOCK(m) \ + ({ \ + register word_t _o; \ + asm volatile("ldq_l %0,%1" : "=r"(_o) : "m"(m)); \ + _o; \ + }) +#define STORE_LOCK(m, v) \ + ({ \ + register word_t _o; \ + asm volatile("stq_c %0,%1" : "=r"(_o) : "m"(m), "0"(v)); \ + _o; \ + }) + +#elif defined(__ppc__) +typedef unsigned long word_t; + +#define RW_NONSTRICT 1 +#define MEMORY_BARRIER asm volatile("eieio") +#define LOAD_LOCK(m) \ + ({ \ + register word_t _o; \ + asm volatile("lwarx %0,0,%1" : "=r"(_o) : "r"(&(m))); \ + _o; \ + }) +#define STORE_LOCK(m, v) \ + ({ \ + register int _o = 0; \ + asm volatile("stwcx. %2,0,%1\n" \ + "\tbc 5,2,$+8\n" \ + "\tori %0,%0,1" \ + : "=r"(_o) \ + : "r"(&(m)), "r"(v), "0"(_o)); \ + _o; \ + }) + +#else +#error Unknown machine type +#endif + +#if !defined(MEMORY_BARRIER) +#define MEMORY_BARRIER +#endif /* !MEMORY_BARRIER */ + +#if !defined(ATOMSET) +#define ATOMSET(m, v) \ + ({ \ + register word_t _o; \ + MEMORY_BARRIER; \ + _o = (m) = (v); \ + MEMORY_BARRIER; \ + _o; \ + }) +#endif /* !ATOMSET */ + +#if defined(LOAD_LOCK) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) \ + ({ \ + register word_t *_m = (word_t *)&(m); \ + LOAD_LOCK(*_m) ? 
1 : !STORE_LOCK(*_m, 1); \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(CAS) +#define CAS(m, c, u) \ + ({ \ + register word_t _o, _t = (u); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + if ((_o = LOAD_LOCK(*_m)) != (c)) \ + break; \ + } while (!STORE_LOCK(*_m, _t)); \ + _o; \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(SWAP) +#define SWAP(m, v) \ + ({ \ + register word_t _o, _v = (v); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + _o = LOAD_LOCK(*_m); \ + } while (!STORE_LOCK(*_m, _v)); \ + _o; \ + }) +#endif + +#if defined(LOAD_LOCK) && !defined(ATOMADD) +#define ATOMADD(m, v) \ + ({ \ + register word_t _o, _v = (v); \ + register word_t *_m = (word_t *)&(m); \ + do { \ + _o = LOAD_LOCK(*_m) + _v; \ + } while (!STORE_LOCK(*_m, _o)); \ + _o; \ + }) +#endif + +#if defined(SWAPB) && !defined(TAS) +typedef unsigned char TAS_t; +#define TAS(m) SWAPB(m, 1) +#endif + +#if defined(SWAP) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) SWAP(m, 1) +#endif + +#if defined(CAS) && !defined(TAS) +typedef word_t TAS_t; +#define TAS(m) CAS(m, 0, 1) +#endif + +#if defined(CAS) && !defined(SWAP) +#define SWAP(m, v) \ + ({ \ + register word_t _t_c1, _t_c2; \ + _t_c2 = _t_c1 = (m); \ + while ((_t_c1 = CAS(m, _t_c1, v)) != _t_c2) \ + ; \ + _t_c2 = _t_c1; \ + _t_c1; \ + }) +#endif + +#if defined(CAS) && !defined(ATOMADD) +#define ATOMADD(m, d) \ + ({ \ + word_t _v, _ov, _d; \ + _d = (d); \ + _ov = _v = (m); \ + while ((_v = CAS(m, _v, _v + _d)) != _ov) \ + _ov = _v; \ + _v + _d; \ + }) +#endif + +#if defined(SWAP) && !defined(ATOMADD) +#define ATOMADD(m, d) \ + ({ \ + word_t _v, _ov, _d; \ + _d = d; \ + _v = m; \ + while (_d) { \ + _ov = _v; \ + _v = SWAP(m, _v + _d); \ + _d = _v - _ov; \ + } \ + _v + _d; \ + }) +#endif + +#endif /* __GNUC__ */ + +#endif /* __atomic_h__ */ diff --git a/src/shm_malloc/make/Makefile b/src/shm_malloc/make/Makefile new file mode 100644 index 0000000..48f3135 --- /dev/null +++ b/src/shm_malloc/make/Makefile @@ -0,0 +1,53 
@@ + +# The following uses gcc with optimizing and debugging symbols +CC = gcc +CFLAGS = -ggdb -O3 -Wall + +MAKELIB = rm -f $@; ar qv $@ $^; ranlib $@ + +SRCS = malloc.c malloc.h atomic.h tshm1.c tshm2.c Makefile malloc.doc shm_malloc.h +MOBJS = malloc.o +SHMOBJS = shm_malloc.o +DBOBJS = db_malloc.o +DBSHMOBJS = db_shm_malloc.o + +all: libmalloc.a libshm.a libdbmalloc.a libdbshm.a tshm1 tshm1db tanon tshm2 tshm2db + +tshm1: tshm1.c $(SHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +tshm1db: tshm1.c $(DBSHMOBJS) + $(CC) $(CFLAGS) -DSHM_FILE='"tshm1db_file"' -o $@ $^ + +tanon: tshm1.c $(SHMOBJS) + $(CC) $(CFLAGS) -DSHM_FILE=0 -o $@ $^ + +tshm2: tshm2.c $(SHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +tshm2db: tshm2.c $(DBSHMOBJS) + $(CC) $(CFLAGS) -o $@ $^ + +libmalloc.a: $(MOBJS); $(MAKELIB) +libshm.a: $(SHMOBJS); $(MAKELIB) +libdbmalloc.a: $(DBOBJS); $(MAKELIB) +libdbshm.a: $(DBSHMOBJS); $(MAKELIB) + +malloc.o: malloc.h + +shm_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DSHM malloc.c -o $@ + +db_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DMALLOC_DEBUG malloc.c -o $@ + +db_shm_malloc.o: malloc.c malloc.h + $(CC) -c $(CFLAGS) -DMALLOC_DEBUG -DSHM malloc.c -o $@ + +tar: $(SRCS) + -rm -f malloc.tar malloc.tar.gz + tar cvf malloc.tar $(SRCS) + gzip -9 malloc.tar + +clean: + -rm *.o *.a tshm1 tshm1db tshm2 tshm2db tanon diff --git a/src/shm_malloc/malloc.c b/src/shm_malloc/malloc.c new file mode 100644 index 0000000..5f25f6d --- /dev/null +++ b/src/shm_malloc/malloc.c @@ -0,0 +1,1589 @@ +#include +#include +#include +#include +#include +#include + +#ifdef SHM +#define malloc shm_malloc +#define realloc shm_realloc +#define free shm_free +#define calloc shm_calloc +#define malloc_small shm_malloc_small +#define valloc shm_valloc +#define sbrk shm_sbrk +#define brk shm_brk +#define minit abort +#define mresize shm_mresize +#define msize shm_msize +#define heapdump shm_heapdump +#else +#ifdef INDIRECT +#define malloc _malloc +#define realloc _realloc +#define free _free +#define 
calloc _calloc +#define valloc _valloc +#define mresize _mresize +#define msize _msize +#define malloc_small _malloc_small +#define heapdump _heapdump +#endif +extern void *sbrk(intptr_t); +extern int brk(void *); +#endif + +#define _S(x) #x +#define S(x) _S(x) + +/* these are the different locking schemes. The numbers associated with +** them are unimportant; they need only be different */ +#define SYSVSEM 1 /* SysV Semaphores */ +#define FLOCK 2 /* File Locks */ +#define SPINLOCK 3 /* atomic test-and-set spinlocks */ +#define PMUTEX 4 /* pthreads mutexes */ + +#if defined(SHM) || defined(_REENTRANT) || defined(_POSIX_THREADS) +#ifndef LOCKTYPE +#if defined(_POSIX_THREADS) /* && !defined(__CYGWIN__) */ +#define LOCKTYPE PMUTEX +#elif defined(__GNUC__) && \ + (defined(mc68000) || defined(sparc) || defined(m88k) || \ + defined(__alpha__) || defined(__ppc__) || defined(__i386__)) +#define LOCKTYPE SPINLOCK +#else +#define LOCKTYPE FLOCK +#endif +#endif + +#else /* !SHM && !_REENTRANT && !_POSIX_THREADS */ + +#undef LOCKTYPE + +#endif /* SHM || _REENTRANT || _POSIX_THREADS */ + +#if defined(__CYGWIN__) +/* don't try to use thread-local vars on cygwin */ +#define thread_local +#elif defined(__STDC__) && __STDC_VERSION__ >= 199901L +#define thread_local __thread +#elif defined(__GNUC__) && __GNUC__ >= 4 +#define thread_local __thread +#else +#define thread_local +#endif + +#include "malloc.h" + +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#undef SMLIST +static inline int SMLIST(int sz) { + int rv; + asm("bsr %1,%0" : "=r"(rv) : "r"((sz - 1) | 1)); + rv -= 7 - size256; + if (rv < 0) + rv = 0; + if (rv >= NUMSMALL) + rv = -1; + return rv; +} +#endif + +#ifdef SHM +#include +#include +#include +#include + +struct basepage *const membase = +#if __SIZEOF_POINTER__ == 8 +#define membase ((struct basepage *)0x1000000000L) +#else +#if defined(sun) +#define membase ((struct basepage *)0xe0000000) +#elif defined(sgi) +#define membase ((struct basepage 
*)0x08000000) +#elif defined(_AIX) +#define membase ((struct basepage *)0x40000000) +#elif defined(__FreeBSD__) +#define membase ((struct basepage *)0x10000000) +#elif defined(__hpux__) +#define membase ((struct basepage *)0xa0000000) +#elif defined(__CYGWIN__) +#define membase ((struct basepage *)0x40000000) +#else +#define membase ((struct basepage *)0x80000000) +#endif +#endif + membase; + +/* minimum number of additional pages to mmap when expanding the heap */ +#define MMAP_INCR 16 + +static void *localbrk; +static int mfd; + +#else /* !SHM */ + +static struct basepage *membase; + +#endif /* SHM */ + +#if defined(SHM) || defined(_REENTRANT) || defined(_POSIX_THREADS) +#if LOCKTYPE == SYSVSEM +#if defined(_AIX) || defined(__osf__) +/* AIX and OSF/1 have eliminated union semun, but are otherwise compatable */ +union semun { + int val; + struct semid_ds *buf; + ushort *array; +}; +#endif /* _AIX || __osf__ */ + +static int semid; +static struct sembuf sembuf; + +#define FIRSTKEY 1 /* first semaphore key to try */ +static int lock_init(int init) { + if (init) { + int um = umask(0); + umask(um); + um = ~um & 0777; + membase->semkey = FIRSTKEY; + while ((semid = semget(membase->semkey, NUMSMALL + 1, + IPC_CREAT | IPC_EXCL | um)) < 0 && + errno == EEXIST) + membase->semkey++; + if (semid >= 0) { + ushort arr[NUMSMALL + 1]; + int i; + union semun semu; + semu.array = arr; + for (i = 0; i <= NUMSMALL; i++) + arr[i] = 1; + if (semctl(semid, 0, SETALL, semu) < 0) { + semctl(semid, 0, IPC_RMID, semu); + return -1; + } + } + } else + semid = semget(membase->semkey, 0, 0); + return (semid < 0) ? -1 : 0; +} +#define LOCK(q) \ + do { \ + sembuf.sem_num = q < NUMSMALL ? q : NUMSMALL; \ + sembuf.sem_op = -1; \ + sembuf.sem_flg = 0; \ + while (semop(semid, &sembuf, 1) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define UNLOCK(q) \ + do { \ + sembuf.sem_num = q < NUMSMALL ? 
q : NUMSMALL; \ + sembuf.sem_op = 1; \ + sembuf.sem_flg = 0; \ + while (semop(semid, &sembuf, 1) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define LOCK_FINI +#define LOCK_DESTROY \ + do { \ + union semun semu; \ + semu.val = 0; \ + semctl(semid, 0, IPC_RMID, semu); \ + } while (0) +#endif /* SYSVSEM */ + +#if LOCKTYPE == FLOCK +static int lfd; +static struct flock lock; + +static int lock_init(int init) { + char lfile[1024]; + + strcpy(lfile, membase->mfile); + strcat(lfile, ".lock"); + lock.l_whence = SEEK_SET; + lock.l_len = 1; + if (!init) { + if ((lfd = open(lfile, O_RDWR, 0)) < 0) + return -1; + } else if ((lfd = open(lfile, O_RDWR | O_CREAT, 0666)) < 0) + return -1; + else + ftruncate(lfd, lfile, NUMSMALL + 1); + fcntl(lfd, F_SETFD, FD_CLOEXEC); + return 0; +} +#define LOCK(q) \ + do { \ + lock.l_type = F_WRLCK; \ + lock.l_start = (q); \ + while (fcntl(lfd, F_SETLKW, &lock) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define UNLOCK(q) \ + do { \ + lock.l_type = F_UNLCK; \ + lock.l_start = (q); \ + while (fcntl(lfd, F_SETLK, &lock) < 0) \ + assert(errno == EINTR); \ + } while (0) +#define LOCK_FINI close(lfd) +#define LOCK_DESTROY \ + do { \ + char lfile[1024]; \ + strcpy(lfile, membase->mfile); \ + strcat(lfile, ".lock"); \ + unlink(lfile); \ + close(lfd); \ + } while (0) +#endif /* FLOCK */ + +#if LOCKTYPE == SPINLOCK +#include +static int lock_init(int init) { + if (init) { + int i; + for (i = NUMSMALL; i >= 0; i--) + membase->locks[i] = 0; + } + return 0; +} +#define LOCK(q) \ + do { \ + volatile TAS_t *_l = &membase->locks[q]; \ + int _try = 10; \ + while (_try > 0 && (*_l || TAS(*_l))) \ + _try--; \ + if (!_try) \ + while (*_l || TAS(*_l)) { \ + struct timeval to = {0, 1000}; \ + select(0, 0, 0, 0, &to); \ + } \ + MEMORY_BARRIER; \ + } while (0) +#define UNLOCK(q) \ + do { \ + MEMORY_BARRIER; \ + membase->locks[q] = 0; \ + } while (0) +#define LOCK_FINI +#define LOCK_DESTROY +#endif /* SPINLOCK */ + +#if LOCKTYPE == PMUTEX +static int 
lock_init(int init) { + if (init) { + int i; + for (i = NUMSMALL; i >= 0; i--) + pthread_mutex_init(&membase->locks[i], 0); + } + return 0; +} + +#define LOCK(q) pthread_mutex_lock(&membase->locks[q]) +#define UNLOCK(q) pthread_mutex_unlock(&membase->locks[q]) +#define LOCK_FINI +#define LOCK_DESTROY +#endif /* PMUTEX */ + +#else /* !SHM && !_REENTRANT && !_POSIX_THREADS */ + +#define LOCK(q) +#define UNLOCK(q) +static int lock_init() { return 0; } + +#endif /* SHM || _REENTRANT || _POSIX_THREADS */ + +typedef unsigned long U; + +#define TARGET(l) (2 << ((NUMSMALL - 1 - (l)) / 2)) +#define PAGENUM(p) (((U)(p) - (U)membase) / PAGESIZE) +#define PAGEADDR(n) ((void *)((U)membase + (U)(n)*PAGESIZE)) +#define PAGEBASE(p) ((U)p & ~(PAGESIZE - 1)) +#define I2(pn) ((pn) % (PAGESIZE / sizeof(struct page))) +#define I1(pn) ((pn) / (PAGESIZE / sizeof(struct page))) +#define ADDR2PAGE(p) (&membase->pages[I1(PAGENUM(p))][I2(PAGENUM(p))]) +#define NUM2PAGE(n) (&membase->pages[I1(n)][I2(n)]) +#define VALID(p) (((U)(p) > (U)membase) && ((U)(p) < (U)membase->end)) +#define FREEPAGE(n) ((struct freepage *)PAGEADDR(n)) + +#ifdef MALLOC_DEBUG +static unsigned long lcrng(unsigned long s) { + unsigned long long mod = (1LL << 31) - 1; + unsigned long long t = s * 16807LL; + + t = (t & mod) + (t >> 31); + if (t > mod) + t -= mod; + return t; +} + +#define GUARD 0xa1962f8dU +#define DB(code) code +#else /* !MALLOC_DEBUG */ +#define DB(code) +#endif /* MALLOC_DEBUG */ + +static inline int pcmp(unsigned _a, unsigned _b) { + struct page *a = NUM2PAGE(_a), *b = NUM2PAGE(_b); + int v; + + v = a->count - b->count; + return v ? v : (long)_a - (long)_b; +} + +#if 0 +/* + * FIXME -- profile this sorter and maybe choose a better one? + * FIXME -- this pivot choice is pessimal for a reversed list, but very + * FIXME -- good (O(n)) for almost sorted lists, which should be our + * FIXME -- common case. 
Probably not a big deal as the lists should + * FIXME -- rarely be big + * FIXME -- we're also assuming the optimizer will do a good job CSEing + * FIXME -- these NUM2PAGE macros after inlining pcmp + * + * This algorithm has very bad behavior with a list that is sorted except + * for the last element, which turns out to be a somewhat common case here. + * + * Quicksort with last-sorted pivot + */ +unsigned page_list_sort(unsigned p, unsigned **tail) +{ +unsigned a, *a_tail, b, *b_tail, pivot; + + if (!p) return p; + pivot = a = p; + a_tail = &a; + p = NUM2PAGE(p)->page; + while (p) { + if (pcmp(pivot, p) > 0) break; + a_tail = &NUM2PAGE(pivot)->page; + pivot = p; + p = NUM2PAGE(p)->page; } + if (!p) { + if (tail) *tail = &NUM2PAGE(pivot)->page; + return a; } + b_tail = &b; + while (p) { + if (pcmp(pivot, p) > 0) { + *a_tail = p; + a_tail = &NUM2PAGE(p)->page; } + else { + *b_tail = p; + b_tail = &NUM2PAGE(p)->page; } + p = NUM2PAGE(p)->page; } + *a_tail = 0; + *b_tail = 0; + if (a) a = page_list_sort(a, &a_tail); + if (b) b = page_list_sort(b, &b_tail); + *a_tail = pivot; + NUM2PAGE(pivot)->page = b; + if (tail) + *tail = b ? b_tail : &NUM2PAGE(pivot)->page; + return a; +} +#else +/* + * FIXME -- profile this sorter and maybe choose a better one? 
+ * + * simple split/merge sort */ +unsigned page_list_sort(unsigned p) { + unsigned a, b, *t; + int asort = 0; + + if (!p) + return p; + a = b = p; + b = NUM2PAGE(b)->page; + if (!b) + return p; + while (b) { + if (!(b = NUM2PAGE(b)->page)) + break; + unsigned l = a; + a = NUM2PAGE(a)->page; + if (!asort && pcmp(l, a) > 0) + asort = 1; + b = NUM2PAGE(b)->page; + } + b = page_list_sort(NUM2PAGE(a)->page); + NUM2PAGE(a)->page = 0; + a = p; + if (asort) + a = page_list_sort(a); + t = &p; + while (a && b) { + if (pcmp(a, b) <= 0) { + *t = a; + t = &NUM2PAGE(a)->page; + a = *t; + } else { + *t = b; + t = &NUM2PAGE(b)->page; + b = *t; + } + } + *t = a | b; + return p; +} +#define page_list_sort(p, t) page_list_sort(p) +#endif + +static thread_local struct localfree freelists[NUMSMALL]; +#ifdef MALLOC_DEBUG +static thread_local struct backup { + struct backup *next; + struct chunk *item; +} * backupfree[NUMSMALL], *backupaux[NUMSMALL], *spare; + +static void tbackup(int i) { + struct chunk *p; + struct backup *q; + + for (p = freelists[i].free, q = backupfree[i]; p && q; + p = p->next, q = q->next) + if (p != q->item) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + if (p || q) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + for (p = freelists[i].aux, q = backupaux[i]; p && q; p = p->next, q = q->next) + if (p != q->item) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } + if (p || q) { + printf("***" S(free) " list for size %d corrupted\n", SIZE(i)); + abort(); + } +} + +static struct backup *balloc() { + struct backup *p; + int i; + + if (!spare) { + p = valloc(PAGESIZE); + i = PAGESIZE / sizeof(struct backup) - 1; + p[i].next = 0; + for (i--; i >= 0; i--) + p[i].next = &p[i + 1]; + spare = p; + } + p = spare; + spare = p->next; + return p; +} + +static void bfree(struct backup *p) { + p->next = spare; + spare = p; +} + +static void *gcheck(void *_b) { + U *b = _b; 
+ if (b) { + b -= 2; + if (b[0] != GUARD || b[1] != lcrng((U)b)) { + printf("***guard corrupted at %p\n", b); + abort(); + } + } + return b; +} + +static void *gsetup(void *_b) { + U *b = _b; + if (b) { + *b++ = GUARD; + *b++ = lcrng((U)_b); + } + return b; +} +#endif /* MALLOC_DEBUG */ + +void *malloc(size_t size) { + int sc; + void *rv; + + DB(size += 2 * sizeof(U);) + sc = SMLIST(size); + if (sc >= 0) + rv = malloc_small(sc); + else + rv = valloc(size); + DB(rv = gsetup(rv);) + return rv; +} + +static void msetup() { + memset(membase, 0, 3 * PAGESIZE); + memcpy(membase->magic, "SHM ", 4); + membase->param[0] = 0; + membase->param[1] = sizeof(void *); +#ifdef LOCKTYPE + membase->param[2] = LOCKTYPE; +#endif +#ifdef MALLOC_DEBUG + membase->param[2] |= 0x80; +#endif + membase->param[3] = NUMSMALL; + membase->base = membase; + membase->pages = (struct page **)((U)membase + PAGESIZE); + membase->end = (void *)((U)membase + PAGESIZE * 3); + membase->pages[0] = (struct page *)((U)membase + PAGESIZE * 2); + membase->pages[0][0].code = BASE; + membase->pages[0][1].code = BASE; + membase->pages[0][2].code = BASE; +} + +#ifdef SHM +static int msetup_valid() { + if (memcmp(membase->magic, "SHM ", 4)) + return 0; + if (membase->param[0] != 0) + return 0; + if (membase->param[1] != sizeof(void *)) + return 0; + if ((membase->param[2] & 0x7f) != LOCKTYPE) + return 0; +#ifdef MALLOC_DEBUG + if (!(membase->param[2] & 0x80)) + return 0; +#else + if (membase->param[2] & 0x80) + return 0; +#endif + if (membase->param[3] != NUMSMALL) + return 0; + if (membase->base != membase) + return 0; + return 1; +} +#endif + +#ifndef SHM +static void minit() { + U p; + + p = (U)sbrk(0); + if (p % PAGESIZE) { + sbrk(PAGESIZE - p % PAGESIZE); + p = (U)sbrk(0); + } + assert(p % PAGESIZE == 0); + + membase = (struct basepage *)p; + sbrk(PAGESIZE * 3); + msetup(); + lock_init(1); +} + +#ifdef WINNT +int brk(void *p) { + void *op = sbrk(0); + + return (int)sbrk((int)p - (int)op); +} +#endif /* WINNT 
*/ + +#else /* SHM */ + +#ifdef __CYGWIN__ + +/* Cygwin's mmap can't deal with multiple partial mappings of a file, so + * in order to map more of our shared mem file, we need to unmap what we + * have mapped and then remap the whole thing as one chunk. This doesn't + * work for anonymous mapping (we'd lose what was previously mapped), so + * we always do them in multiples of 64K which seems to work out ok */ + +static void *cygwin_mmap(void *addr, size_t length, int prot, int flags, int fd, + off_t offset) { + if (fd >= 0 && addr != membase) { + munmap(membase, offset); + length += offset; + addr = membase; + offset = 0; + } else if (fd < 0) { + /* how much was already mapped by a previous mmap */ + size_t done = -(intptr_t)addr & 0xffff; + if (length <= done) + return addr; + addr = (char *)addr + done; + length -= done; + /* round up to 64K */ + length |= 0xffff; + length++; + offset = 0; /* should be ignored by mmap */ + } + return mmap(addr, length, prot, flags, fd, offset); +} + +#define mmap cygwin_mmap + +#endif /* __CYGWIN__ */ + +static struct sigaction oldsegv; + +static void shm_segv() { + void *newbrk; + int flags = MAP_SHARED | MAP_FIXED; + + /* if a SEGV occurred and there's new memory to be mapped, map it + ** and retry */ + if (mfd < 0) + flags |= MAP_ANONYMOUS; + newbrk = membase->eof; + if (newbrk > localbrk) { + if (mfd >= 0) + lseek(mfd, 0, SEEK_SET); + mmap(localbrk, newbrk - localbrk, PROT_READ | PROT_WRITE, flags, mfd, + localbrk - (void *)membase); + localbrk = newbrk; + } else { + /* no more to map, must be a real SEGV */ + sigaction(SIGSEGV, &oldsegv, 0); + } +} + +int shm_destroy() { + LOCK_DESTROY; + unlink(membase->mfile); + munmap(membase, localbrk - (void *)membase); + close(mfd); + return 0; +} + +int shm_init(const char *mfile, void (*init_fn)()) { + int tmp, wait = 5; + int flags = MAP_SHARED | MAP_FIXED; + struct sigaction segv; + + mfd = -1; + if (!mfile) + flags |= MAP_ANONYMOUS; + while (mfd == -1) { + if (mfile && (mfd = 
open(mfile, O_RDWR, 0)) >= 0) { + /* make sure the file isn't empty */ + while (read(mfd, &tmp, sizeof(tmp)) == 0) { + if (wait-- < 0) { + close(mfd); + errno = EINVAL; + return -1; + } + sleep(1); + } + lseek(mfd, 0, SEEK_SET); + if ((long)mmap(membase, PAGESIZE, PROT_READ | PROT_WRITE, flags, mfd, + 0) == -1) { + close(mfd); + return -1; + } + /* wait until initialization is complete */ + while (!membase->init && wait-- > 0) + sleep(1); + if (!membase->init || !msetup_valid()) { + munmap(membase, PAGESIZE); + close(mfd); + errno = EINVAL; + return -1; + } + localbrk = membase->eof; + lseek(mfd, 0, SEEK_SET); + if ((long)mmap((void *)membase + PAGESIZE, + (localbrk - (void *)membase) - PAGESIZE, + PROT_READ | PROT_WRITE, flags, mfd, PAGESIZE) == -1) { + close(mfd); + return -1; + } + } else if (!mfile || + (errno == ENOENT && + (mfd = open(mfile, O_RDWR | O_CREAT | O_EXCL, 0666)) >= 0)) { + if (mfd >= 0) { + if (ftruncate(mfd, 3 * PAGESIZE) < 0) { + close(mfd); + return -1; + } + lseek(mfd, 0, SEEK_SET); + } + if ((long)mmap(membase, 3 * PAGESIZE, PROT_READ | PROT_WRITE, flags, mfd, + 0) == -1) { + close(mfd); + return -1; + } + msetup(); + localbrk = membase->brk = membase->eof = membase->end; + strcpy(membase->mfile, mfile ? 
mfile : ""); + membase->global = 0; + break; + } else if (errno != EEXIST) + return -1; + } + fcntl(mfd, F_SETFD, FD_CLOEXEC); + if (lock_init(!membase->init) < 0) { + munmap(membase, localbrk - (void *)membase); + close(mfd); + return -1; + } + if (!membase->init && init_fn) + init_fn(); + segv.sa_flags = 0; + sigemptyset(&segv.sa_mask); + segv.sa_handler = shm_segv; + sigaction(SIGSEGV, &segv, &oldsegv); + membase->init = 1; + return 0; +} + +static void flush_to_global_freelist(int, struct chunk *, struct chunk *); + +/* flush out local free lists, so we can exit leaving memory consistent */ +int shm_fini() { + int l; + + for (l = 0; l < NUMSMALL; l++) { + if (freelists[l].aux || freelists[l].free) + flush_to_global_freelist(l, freelists[l].free, freelists[l].aux); + freelists[l].aux = freelists[l].free = 0; + freelists[l].count = 0; + } + // munmap(membase, localbrk - (void *)membase); + // close(mfd); + LOCK_FINI; + return 0; +} + +/* clear all the free lists, as they really belong to the parent */ +int shm_child() { + int l; + + for (l = 0; l < NUMSMALL; l++) + freelists[l].aux = freelists[l].free = 0; + return 0; +} + +static int shm_brk(void *newbrk) { + char tmp = 0; + int flags = MAP_SHARED | MAP_FIXED; + + if (mfd < 0) + flags |= MAP_ANONYMOUS; + if (newbrk <= membase->brk) { + if (ftruncate(mfd, newbrk - (void *)membase) < 0) + return -1; + if (newbrk < membase->eof) + munmap(newbrk, membase->eof - newbrk); + membase->brk = membase->eof = localbrk = newbrk; + } else if (newbrk <= membase->eof) { + membase->brk = newbrk; + if (newbrk > localbrk) { + if (mfd >= 0) + lseek(mfd, 0, SEEK_SET); + if ((long)mmap(localbrk, membase->eof - localbrk, PROT_READ | PROT_WRITE, + flags, mfd, localbrk - (void *)membase) == -1) + return -1; + localbrk = membase->eof; + } + } else { + void *neweof = membase->brk + PAGESIZE * MMAP_INCR; + if (newbrk > neweof) + neweof = newbrk; + if (mfd >= 0) { + if (ftruncate(mfd, neweof - (void *)membase) < 0) + return -1; + if 
(lseek(mfd, neweof - (void *)membase - 1, SEEK_SET) < 0) + return -1; + if (write(mfd, &tmp, 1) != 1) + return -1; + lseek(mfd, 0, SEEK_SET); + } + membase->eof = neweof; + if ((long)mmap(localbrk, neweof - localbrk, PROT_READ | PROT_WRITE, flags, + mfd, localbrk - (void *)membase) == -1) + return -1; + localbrk = neweof; + membase->brk = newbrk; + } + return 0; +} + +static void *shm_sbrk(int delta) { + void *oldbrk = membase->brk; + + return shm_brk(membase->brk + delta) < 0 ? (void *)-1 : oldbrk; +} + +void *shm_global() { return membase->global; } + +void shm_set_global(void *v) { membase->global = v; } + +#endif /* SHM */ + +/* +** The free page list contains all the entirely free pages. It is organized +** as a `list of lists' with blocks of the same size in the same list. +** the lists are sorted order of size (smallest first), and each list is +** sorted in memory order (lowest address first) +*/ + +#ifdef MALLOC_DEBUG +/* check the global freepage lists to ensure consistency, and ensure that + * 'p' is present (free) on there */ +static void fp_verify(struct freepage *p) { + struct freepage *t1, *t2; + struct page *pp; + int i; + + if (membase->freepages && + (!VALID(membase->freepages) || + membase->freepages->parent != &membase->freepages)) { + printf("***" S(free) "list corrupt (base table)\n"); + abort(); + } + for (t1 = membase->freepages; t1; t1 = t1->bigger) { + if (t1->bigger && (!VALID(t1->bigger) || t1->size >= t1->bigger->size || + t1->bigger->parent != &t1->bigger)) { + printf("***" S(free) "list corrupt (page %p ?)\n", t1); + abort(); + } + for (t2 = t1; t2; t2 = t2->next) { + if (p == t2) + p = 0; + if (t2->next && + (!VALID(t2->next) || t2->next->bigger || t2->size != t2->next->size || + t2->next->parent != &t2->next)) { + printf("***" S(free) "list corrupt (page %p ?)\n", t2); + abort(); + } + pp = ADDR2PAGE(t2); + if (pp->code != BIG + FREE || + PAGEADDR(pp->page - t2->size) != (void *)t2) { + printf("***page tables corrupt (page %p)\n", 
t2); + abort(); + } + for (i = 1; i < t2->size; i++) { + struct page *ip = NUM2PAGE(PAGENUM(t2) + i); + if (ip->code != MIDDLE || PAGEADDR(ip->page) != (void *)t2) { + printf("***page tables corrupt (page %p)\n", + (char *)t2 + i * PAGESIZE); + abort(); + } + } + } + } + if (p) { + printf("***apparently free page %p not on " S(free) "list\n", p); + abort(); + } +} +#else /* !MALLOC_DEBUG */ +#define fp_verify(p) +#endif /* MALLOC_DEBUG */ + +static void fp_remove(struct freepage *p) { + fp_verify(p); + if (p->next) { + (*p->parent) = p->next; + p->next->parent = p->parent; + if ((p->next->bigger = p->bigger)) + p->bigger->parent = &p->next->bigger; + } else { + if (((*p->parent) = p->bigger)) + p->bigger->parent = p->parent; + } +} + +static void fp_add(struct freepage *p) { + struct freepage **t = &membase->freepages; + + fp_verify(0); + while (*t && (*t)->size < p->size) + t = &(*t)->bigger; + if (*t && (*t)->size == p->size) { + while (*t && (U)*t < (U)p) + t = &(*t)->next; + if ((p->next = (*t))) { + if ((p->bigger = p->next->bigger)) { + p->bigger->parent = &p->bigger; + p->next->bigger = 0; + } + p->next->parent = &p->next; + } else + p->bigger = 0; + } else { + p->next = 0; + if ((p->bigger = (*t))) + p->bigger->parent = &p->bigger; + } + *t = p; + p->parent = t; +} + +static struct freepage *fp_find(U size) { + struct freepage *t; + + fp_verify(0); + for (t = membase->freepages; t && t->size < (int)size; t = t->bigger) + ; + if (t) + fp_remove(t); + return t; +} + +void *malloc_small(int l) { + struct chunk *new; + + if (!membase) + minit(); + assert(l >= SMLIST(sizeof(void *)) && l < NUMSMALL); + DB(tbackup(l)); + if (!freelists[l].free) { + if (freelists[l].aux) { + freelists[l].free = freelists[l].aux; + freelists[l].aux = 0; + DB(backupfree[l] = backupaux[l]); + DB(backupaux[l] = 0); + } else { + int i; + struct chunk *new_fl = 0; + if (!freelists[l].target) + freelists[l].target = TARGET(l); + LOCK(l); + for (i = freelists[l].target; i; i--) { + 
unsigned pn; + struct page *p; + if (!(pn = membase->freechunks[l])) { + int j; + if (!(new = valloc(PAGESIZE))) { + UNLOCK(l); + return 0; + } + pn = PAGENUM(new); + p = ADDR2PAGE(new); + p->code = l; + p->count = j = PERPAGE(l); + p->free = 0; + p->page = 0; + while (--j) { + struct chunk *prev = new; + new = (struct chunk *)((U) new + SIZE(l)); + prev->next = new; + } + new->next = 0; + membase->freechunks[l] = pn; + } + p = NUM2PAGE(pn); + new = (struct chunk *)((U)PAGEADDR(pn) + p->free * SIZE(l)); + if (new->next) + p->free = ((U) new->next - (U)PAGEADDR(pn)) / SIZE(l); + else { + p->free = 0; + assert(p->count == 1); + } + if (!--p->count) { + assert(p->free == 0); + membase->freechunks[l] = p->page; + } + new->next = new_fl; + new_fl = new; + } + freelists[l].free = new_fl; + DB({ + struct chunk *p; + struct backup **q; + for (p = new_fl, q = &backupfree[l]; p; p = p->next, q = &(*q)->next) { + *q = balloc(); + (*q)->item = p; + } + *q = 0; + }); + UNLOCK(l); + } + freelists[l].count = freelists[l].target; + } + new = freelists[l].free; + freelists[l].free = new->next; + DB({ + struct backup *tmp = backupfree[l]; + backupfree[l] = tmp->next; + bfree(tmp); + }); + freelists[l].count--; + DB(tbackup(l)); + return (void *)new; +} + +/* allocate 'size' pages without expanding the heap. + * Return 0 if that's not possible + * must hold LOCK(NUMSMALL) before calling */ +static void *alloc_pages(int size) { + unsigned i; + void *p = fp_find(size); + if (p) { + unsigned pn = PAGENUM(p); + struct page *pg = NUM2PAGE(pn); + pg->code = BIG; + if (pg->page - pn > (int)size) { + unsigned extra = pn + size; + struct page *extrapg = NUM2PAGE(extra); + extrapg->code = BIG + FREE; + extrapg->page = pg->page; + FREEPAGE(extra)->size = i = pg->page - extra; + while (--i) + NUM2PAGE(extra + i)->page = extra; + fp_add(FREEPAGE(extra)); + } + } + return p; +} + +/* Free a block of one or more pages, without shrinking the heap. 
+ * Coalesce with adjacent free block and return the resulting (possibly + * larger) free block + * Must hold LOCK(NUMSMALL) before calling. */ +static struct freepage *free_pages(void *p) { + unsigned i, adj; + struct page *pg = ADDR2PAGE(p), *adjpg; + struct freepage *fpage = p; + assert(pg->code == BIG); + pg->code = BIG + FREE; + adj = PAGENUM(p) - 1; + adjpg = NUM2PAGE(adj); + if (adjpg->code == MIDDLE) { + adj = adjpg->page; + adjpg = NUM2PAGE(adj); + } + if (adjpg->code == BIG + FREE) { + fpage = FREEPAGE(adj); + fp_remove(fpage); + adjpg->page = pg->page; + pg->code = MIDDLE; + for (i = PAGENUM(p); i < adjpg->page; i++) + NUM2PAGE(i)->page = adj; + pg = adjpg; + fpage->size = adjpg->page - adj; + } else + fpage->size = pg->page - PAGENUM(p); + if (PAGEADDR(pg->page) < membase->end) { + adj = pg->page; + adjpg = NUM2PAGE(adj); + if (adjpg->code == BIG + FREE) { + fp_remove(FREEPAGE(adj)); + adjpg->code = MIDDLE; + pg->page = adjpg->page; + for (i = adj; i < pg->page; i++) + NUM2PAGE(i)->page = PAGENUM(fpage); + fpage->size = pg->page - PAGENUM(fpage); + } + } + fp_add(fpage); + return fpage; +} + +/* Initialize the page descriptors for an extent of memory that is in use. + * Must hold LOCK(NUMSMALL) before calling. */ +static void setup_extent_descriptor(void *p, int size) { + struct page *pg = ADDR2PAGE(p); + pg->page = PAGENUM(p) + size; + pg->code = BIG; + while (--size > 0) { + pg = NUM2PAGE(PAGENUM(p) + size); + pg->page = PAGENUM(p); + pg->code = MIDDLE; + } +} + +/* Expand the master page descriptor table to contain descriptors for pages. + * up to "to". Must hold LOCK(NUMSMALL) before calling. 
*/ +static int expand_page_table(unsigned to) { + int i, added_pages = 0, newmastersize = 0; + unsigned old = PAGENUM(membase->end) - 1; + void *oldmaster = 0; + if (PAGENUM(&membase->pages[I1(to)]) != PAGENUM(&membase->pages[I1(old)])) { + /* FIXME -- there's a race condition here when we resize the top-level + * FIXME -- pages table, as everyone accesses it without aquiring a + * FIXME -- lock. So we ensure that the old top-level table remains + * FIXME -- valid for awhile after being reallocated. That way. if + * FIXME -- someone is in the middle of accessing it, they'll still + * FIXME -- get the right info. as long as they're not delayed */ + struct page **master; + int oldmastersize = I1(old) / (PAGESIZE / sizeof(struct page *)) + 1; + newmastersize = I1(to) / (PAGESIZE / sizeof(struct page *)) + 1; + if (!(master = alloc_pages(newmastersize))) { + to += newmastersize; + newmastersize = I1(to) / (PAGESIZE / sizeof(struct page *)) + 1; + if ((master = sbrk(newmastersize * PAGESIZE)) == (void *)-1) + return 0; + membase->end = (void *)((U)master + newmastersize * PAGESIZE); + } + memcpy(master, membase->pages, oldmastersize * PAGESIZE); + memset((void *)((U)master + oldmastersize * PAGESIZE), 0, + (newmastersize - oldmastersize) * PAGESIZE); + void *oldmaster = membase->pages; + membase->pages = master; + /* remark the old master as a generic extent, so we can free it */ + setup_extent_descriptor(oldmaster, oldmastersize); + } + for (i = I1(old) + 1; i <= I1(to); i++) { + assert(membase->pages[i] == 0); + if ((membase->pages[i] = alloc_pages(1))) { + ADDR2PAGE(membase->pages[i])->code = BASE; + } else { + if ((membase->pages[i] = sbrk(PAGESIZE)) == (void *)-1) { + if (oldmaster) + free_pages(oldmaster); + return 0; + } + added_pages++; + } + memset(membase->pages[i], 0, PAGESIZE); + } + membase->end = PAGEADDR(to + 1); + if (newmastersize) + for (i = 0; i < newmastersize; i++) + NUM2PAGE(PAGENUM(membase->pages) + i)->code = BASE; + if (added_pages) { + if 
(!expand_page_table(to + added_pages)) { + if (oldmaster) + free_pages(oldmaster); + return 0; + } + for (i = I1(to); added_pages; i--, added_pages--) + ADDR2PAGE(membase->pages[i])->code = BASE; + } + if (oldmaster) + free_pages(oldmaster); + return 1; +} + +void *valloc(size_t size) { + void *new; + + size = (size + PAGESIZE - 1) / PAGESIZE; /* size in pages */ + if (!membase) + minit(); + LOCK(NUMSMALL); + if (!(new = alloc_pages(size))) { + if ((new = sbrk(size * PAGESIZE)) == (void *)-1) { + UNLOCK(NUMSMALL); + return 0; + } + if ((U) new % PAGESIZE) { + if (sbrk(PAGESIZE - (U) new % PAGESIZE) == (void *)-1) { + if (brk(new)) /* ignore return value */ + ; + UNLOCK(NUMSMALL); + return 0; + } + new += PAGESIZE - (U) new % PAGESIZE; + } + if (I1(PAGENUM(new) + size - 1) != I1(PAGENUM(membase->end) - 1)) { + if (!expand_page_table(PAGENUM(new) + size - 1)) { + if ((U) new > (U)membase->end) { + if (brk(new)) /* ignore return value */ + ; + UNLOCK(NUMSMALL); + return 0; + } + } + } else + membase->end = new + size *PAGESIZE; + setup_extent_descriptor(new, size); + } + UNLOCK(NUMSMALL); + return new; +} + +static void flush_to_global_freelist(int l, struct chunk *cp, + struct chunk *cp2) { + struct chunk *tmp; + struct page *p; + + LOCK(l); + if (!cp) { + cp = cp2; + cp2 = 0; + } + for (; cp; cp = tmp) { + if (!(tmp = cp->next)) { + tmp = cp2; + cp2 = 0; + } + p = ADDR2PAGE(cp); + cp->next = (void *)(p->count ? 
PAGEBASE(cp) + p->free * SIZE(l) : 0); + p->free = ((U)cp - PAGEBASE(cp)) / SIZE(l); + if (!p->count) { + p->page = membase->freechunks[l]; + membase->freechunks[l] = PAGENUM(cp); + } + if (++p->count >= PERPAGE(l)) { + p->count = -1; + } + } + membase->freechunks[l] = page_list_sort(membase->freechunks[l], 0); + while (membase->freechunks[l] && + (p = NUM2PAGE(membase->freechunks[l]))->count < 0) { + unsigned pn = membase->freechunks[l]; + void *pp = PAGEADDR(pn); + membase->freechunks[l] = p->page; + p->code = BIG; + p->count = 0; + p->free = 0; + p->page = pn + 1; + DB(pp = gsetup(pp);) + free(pp); + } + UNLOCK(l); +} + +void free(void *_old) { + struct chunk *old = _old; + struct page *p; + int l; +#ifdef MALLOC_DEBUG + struct chunk *t, *last; + int i; +#endif /* MALLOC_DEBUG */ + + if (!old) + return; +#ifdef MALLOC_DEBUG + if ((U)old < (U)membase || (U)old >= (U)membase->end) { + printf("***Invalid pointer given to " S(free) " %p\n", old); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(old = gcheck(old);) + p = ADDR2PAGE(old); + if ((l = p->code) < NUMSMALL) { +#ifdef MALLOC_DEBUG + if (((U)old & (SIZE(l) - 1)) != 0) { + printf("***Invalid pointer given to " S(free) " %p\n", old); + abort(); + } + for (last = 0, t = freelists[l].free, i = 0; t; + last = t, t = t->next, i++) { + if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (!VALID(t) || ADDR2PAGE(t)->code != l) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > freelists[l].count) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i != freelists[l].count) { + printf("***" S(free) "list corrupt (freelist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + for (last = 0, t = freelists[l].aux, i = 0; t; last = t, t = t->next, i++) { 
+ if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (!VALID(t) || ADDR2PAGE(t)->code != l) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > freelists[l].target) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i && i != freelists[l].target) { + printf("***" S(free) "list corrupt (auxlist %d)\n", l); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (p->count) { + t = (void *)(PAGEBASE(old) + p->free * SIZE(l)); + for (last = 0, i = 0; t; last = t, t = t->next, i++) { + if (t == old) { + printf("***double " S(free) " of %p\n", old); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if ((U)t / PAGESIZE != (U)old / PAGESIZE) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + if (i > p->count) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } + if (i != p->count) { + printf("***" S(free) "list corrupt (page %p)\n", + (void *)((U)old & ~(PAGESIZE - 1))); + if (last) + printf(" (block at %p ?)\n", last); + abort(); + } + } +#endif /* MALLOC_DEBUG */ + DB(tbackup(l)); + if (freelists[l].count == freelists[l].target) { + if (freelists[l].aux) { + struct chunk *tmp = freelists[l].aux; + freelists[l].aux = 0; + flush_to_global_freelist(l, tmp, 0); + DB({ + struct backup *p; + for (p = backupaux[l]; p->next; p = p->next) + ; + p->next = spare; + spare = backupaux[l]; + backupaux[l] = 0; + }) + } + freelists[l].aux = freelists[l].free; + freelists[l].free = 0; + DB(backupaux[l] = backupfree[l]); + DB(backupfree[l] = 0); + freelists[l].count = 0; + } + old->next = 
freelists[l].free; + freelists[l].count++; + freelists[l].free = old; + DB({ + struct backup *p = balloc(); + p->next = backupfree[l]; + p->item = old; + backupfree[l] = p; + }) + DB(tbackup(l)); + } else { + struct freepage *fpage; + assert(l == BIG); + assert(((U)old & (PAGESIZE - 1)) == 0); + LOCK(NUMSMALL); + fpage = free_pages(old); + if ((void *)((U)fpage + fpage->size * PAGESIZE) == membase->end && + sbrk(0) == membase->end) { + fp_remove(fpage); + sbrk((U)fpage - (U)membase->end); + membase->end = fpage; + } + UNLOCK(NUMSMALL); + } +} + +int mresize(void *old, U size) { + unsigned t, i; + struct page *op, *tpg; + int nl; + void *pp; + + if (!old) + return 0; +#ifdef MALLOC_DEBUG + if ((U)old < (U)membase || (U)old >= (U)membase->end) { + printf("***Invalid pointer given to " S(mresize) " %p\n", old); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(old = gcheck(old); size += 2 * sizeof(U);) + op = ADDR2PAGE(old); + nl = SMLIST(size); + if (op->code == nl) + return 1; + if (op->code == BIG && nl == -1) { + size = (size + PAGESIZE - 1) / PAGESIZE; + if ((int)size > op->page - PAGENUM(old)) { + LOCK(NUMSMALL); + if (PAGEADDR(op->page) == membase->end || + (tpg = NUM2PAGE(op->page))->code != BIG + FREE || + tpg->page - PAGENUM(old) < (int)size) { + UNLOCK(NUMSMALL); + return 0; + } + fp_remove(FREEPAGE(op->page)); + tpg->code = MIDDLE; + for (i = op->page, op->page = t = tpg->page; i < t; i++) + NUM2PAGE(i)->page = PAGENUM(old); + UNLOCK(NUMSMALL); + } + if ((int)size < op->page - PAGENUM(old)) { + LOCK(NUMSMALL); + t = PAGENUM(old) + size; + tpg = NUM2PAGE(t); + tpg->code = BIG; + tpg->page = op->page; + for (i = op->page - 1; i > t; i--) + NUM2PAGE(i)->page = t; + op->page = t; + UNLOCK(NUMSMALL); + pp = PAGEADDR(t); + DB(pp = gsetup(pp);) + free(pp); + } + assert((int)size == op->page - PAGENUM(old)); + return 1; + } + return 0; +} + +U msize(void *p) { + struct page *pg; + U size; + +#ifdef MALLOC_DEBUG + if ((U)p < (U)membase || (U)p >= (U)membase->end) { + 
printf("***Invalid pointer given to " S(msize) " %p\n", p); + abort(); + } +#endif /* MALLOC_DEBUG */ + DB(p = gcheck(p);) + pg = ADDR2PAGE(p); + if (pg->code < NUMSMALL) { + assert(((U)p & (SIZE(pg->code) - 1)) == 0); + size = SIZE(pg->code); + } else { + assert((pg->code & ~FREE) == BIG); + assert(((U)p & (PAGESIZE - 1)) == 0); + size = (pg->page - PAGENUM(p)) * PAGESIZE; + } + DB(size -= 2 * sizeof(U)); + return size; +} + +void *realloc(void *old, size_t size) { + if (size == 0) { + free(old); + return 0; + } else if (!old) { + return malloc(size); + } else if (mresize(old, size)) { + return old; + } else { + U osize = msize(old); + void *new = malloc(size); + + if (size > osize) + size = osize; + if (new) { + memcpy(new, old, size); + free(old); + } + return new; + } +} + +void *calloc(size_t n1, size_t n2) { + U size = n1 * n2; + void *new = malloc(size); + + if (new) + memset(new, 0, size); + return new; +} + +void heapdump() { + void *p; + struct page *pg; + struct chunk *cp; + unsigned i, j, cnt, lbig = 0; + char buffer[64]; + struct freepage *p1, *p2; + + cnt = ((U)membase->end - (U)membase) / PAGESIZE; + printf("membase = %p, end = %p, %u pages total", membase, membase->end, cnt); + for (i = 0, p = membase; p < membase->end; p += PAGESIZE, i++) { + if (i % 8 == 0) + printf("\n0x%08lx: ", (U)p); + pg = NUM2PAGE(i); + if (pg->code < NUMSMALL) { + sprintf(buffer, "%d(%d)", SIZE(pg->code), pg->count); + printf("%8s", buffer); + lbig = 0; + } else + switch (pg->code & ~FREE) { + case BIG: + printf(" %c %-5d", pg->code & FREE ? 'F' : 'B', pg->page - i); + lbig = i; + break; + case MIDDLE: + if (pg->page == lbig) + printf(" <--> "); + else + printf(" ?--? "); + break; + case BASE: + printf(pg->code & FREE ? " GAP " : " BASE "); + lbig = 0; + break; + default: + printf(" ???? 
"); + lbig = 0; + break; + } + } + for (i = 0; i < NUMSMALL; i++) { + printf("\n\nSIZE %4d: local %d", SIZE(i), freelists[i].count); + if (freelists[i].free) { + printf("[%p", freelists[i].free); + for (cp = freelists[i].free->next; cp; cp = cp->next) + printf(", %p", cp); + printf("]"); + } + if (freelists[i].aux) { + printf(" + [%p", freelists[i].aux); + for (cp = freelists[i].aux->next; cp; cp = cp->next) + printf(", %p", cp); + printf("]"); + } + printf("\n\t global "); + for (j = membase->freechunks[i]; j; j = pg->page) { + pg = NUM2PAGE(j); + if (pg->code != i) + printf("", pg->code); + printf("%d[", pg->count); + if (pg->count) + for (cp = PAGEADDR(j) + pg->free * SIZE(i); cp; cp = cp->next) + printf("%p%s", cp, cp->next ? ", " : "]"); + if (pg->page) + printf(" + "); + } + } + printf("\n\nBIG: "); + for (p1 = membase->freepages; p1; p1 = p1->bigger) { + printf("\t%d[%p", p1->size, p1); + for (p2 = p1->next; p2; p2 = p2->next) + printf(", %p", p2); + printf("]\n"); + } + printf("\n\n\n"); +} diff --git a/src/shm_malloc/malloc.h b/src/shm_malloc/malloc.h new file mode 100644 index 0000000..9f564d3 --- /dev/null +++ b/src/shm_malloc/malloc.h @@ -0,0 +1,171 @@ +#ifndef _malloc_h_ +#define _malloc_h_ +#include + +#if defined(LOCKTYPE) && LOCKTYPE == SYSVSEM +#include +#include +#endif /* SYSVSEM */ +#if defined(LOCKTYPE) && LOCKTYPE == SPINLOCK +#include "atomic.h" +#endif /* SPINLOCK */ +#if defined(LOCKTYPE) && LOCKTYPE == PMUTEX +#include +#endif /* PMUTEX */ + +/* PAGESIZE MUST be a constant and MUST be a power of 2. It may be larger +** than the actual machine page size, but probably can't be smaller +** Total heap memory is limited to PAGESIZE * 2^32, and must in fact be in +** one contiguous extent that size or smaller (so if brk/sbrk gives you holes +** you'll get less memory.) 
PAGESIZE must be <= sizeof(void *) * 2^11, (8K +** on a 32-bit machine, 16K on 64-bit) though the bitfields could be rearranged +** to allow up to sizeof(void *) * 2^12 fairly easily. +*/ +#if defined(__alpha__) +#define PAGESIZE 8192 +#else +#define PAGESIZE 4096 +#endif + +enum { + size4, + size8 +#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ > 4 + = 0 +#endif + , + size16 +#if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ > 8 + = 0 +#endif + , + size32, + size64, + size128, + size256, + size512, + size1024, + size2048, + size4096, + size8192, + size16384, + size32768, + size65536, + BIG = 26, + MIDDLE = 28, + BASE = 30, + FREE = 1 +}; +#define CAT(A, B) A##B +#define XCAT(A, B) CAT(A, B) + +/* number of small (<1 page) block sizes supported */ +#define NUMSMALL XCAT(size, PAGESIZE) +#define LOG2PAGESIZE (NUMSMALL + (8 - size256)) +#define SIZE(l) ((1 << (8 - size256)) << (l)) +#define PERPAGE(l) ((PAGESIZE / (1 << (8 - size256))) >> (l)) + +struct page { /* descriptor for a page */ + unsigned page; /* page number of next page with same chunksize + * for small chunk pages. + * page number after end of extent for BIG + * page number of start of extent for MIDDLE */ + unsigned free : 11; /* offset of first free chunk on page. + * 0 for non small chunk pages */ + int count : 12; /* number of free chunks on page. + * 0 for non small chunk pages */ + unsigned code : 8; /* code describing size of objects on page + * 1024 && (sz) <= 1024 ? size1024 \ + : PAGESIZE > 2048 && (sz) <= 2048 ? size2048 \ + : PAGESIZE > 4096 && (sz) <= 4096 ? size4096 \ + : PAGESIZE > 8192 && (sz) <= 8192 ? size8192 \ + : PAGESIZE > 16384 && (sz) <= 16384 ? size16384 \ + : PAGESIZE > 32768 && (sz) <= 32768 ? size32768 \ + : -1) +/* use the MALLOC macro with a CONSTANT argument for fast mallocs, less than +** half a page. Will crash if greater than half a page. 
No speed advantage +** if the argument is not constant */ +#ifndef MALLOC_DEBUG +#define MALLOC(sz) malloc_small(SMLIST(sz)) +#else +#define MALLOC(sz) (assert((sz) <= PAGESIZE / 2), malloc(sz)) +#endif + +#endif /* _malloc_h_ */ diff --git a/src/shm_malloc/shm_malloc.h b/src/shm_malloc/shm_malloc.h new file mode 100644 index 0000000..3b5e737 --- /dev/null +++ b/src/shm_malloc/shm_malloc.h @@ -0,0 +1,28 @@ +#ifndef _shm_malloc_h_ +#define _shm_malloc_h_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int shm_init(const char *, void (*)()), shm_fini(), shm_destroy(), + shm_child(); + +extern void *shm_malloc(size_t), *shm_calloc(size_t, size_t), + *shm_realloc(void *, size_t), *shm_valloc(size_t); +extern void shm_free(void *); +extern int shm_mresize(void *, size_t); +extern size_t shm_msize(void *); + +extern void *shm_global(), shm_set_global(void *); + +#define FIRST_TWO_ARGS(a, b, ...) a, b +#define shm_init(...) shm_init(FIRST_TWO_ARGS(__VA_ARGS__, 0)) + +#ifdef __cplusplus +} +#endif + +#endif /* _shm_malloc_h_ */ diff --git a/src/shm_malloc/tshm1.c b/src/shm_malloc/tshm1.c new file mode 100644 index 0000000..8b1b23b --- /dev/null +++ b/src/shm_malloc/tshm1.c @@ -0,0 +1,63 @@ +#include "shm_malloc.h" +#include +#include +#include + +#ifndef SHM_FILE +#define SHM_FILE "tshm1_file" +#endif + +struct list { + struct list *next; + char *data; +}; + +struct head { + struct list *head; + struct list **tail; +}; + +void setup() { + struct head *h = shm_malloc(sizeof(struct head)); + if (!h) { + perror("shm_malloc"); + exit(1); + } + h->head = 0; + h->tail = &h->head; + shm_set_global(h); +} + +int main(int ac, char **av) { + int i; + struct head *h; + struct list *l; + + if (shm_init(SHM_FILE, setup) < 0) { + perror("shm_init"); + exit(1); + } + h = shm_global(); + if (!h) { + perror("shm_global"); + exit(1); + } + for (i = 1; i < ac; i++) { + if (!(l = shm_malloc(sizeof(struct list)))) { + perror("shm_malloc"); + exit(1); + } + l->next = 0; + if 
(!(l->data = shm_malloc(strlen(av[i]) + 1))) { + perror("shm_malloc"); + exit(1); + } + strcpy(l->data, av[i]); + *h->tail = l; + h->tail = &l->next; + } + for (l = h->head; l; l = l->next) { + printf("%s\n", l->data); + } + return 0; +} diff --git a/src/shm_malloc/tshm2.c b/src/shm_malloc/tshm2.c new file mode 100644 index 0000000..a8594b0 --- /dev/null +++ b/src/shm_malloc/tshm2.c @@ -0,0 +1,128 @@ +#include "shm_malloc.h" +#include +#include +#include + +struct list { + struct list *next; +}; + +#define MAX_PATTERNS 100 +struct pattern { + long size, count; +} test[MAX_PATTERNS]; +int patterns = 0; + +long getval(char *s, char **p) { + long rv = strtol(s, p, 0); + char ch = **p; + if (ch == 'k' || ch == 'K') { + rv *= 1024; + ++*p; + } else if (ch == 'm' || ch == 'M') { + rv *= 1024 * 1024; + ++*p; + } else if (ch == 'g' || ch == 'G') { + rv *= 1024 * 1024 * 1024; + ++*p; + } + return rv; +} + +int main(int ac, char **av) { + int i, j; + struct list *head = 0, **tail = &head, *l, *next; + const char *file = 0; + long total = 0, maxcnt = 0, count; + int suffix; + + for (i = 1; i < ac; i++) { + if (isdigit(*av[i]) && patterns < MAX_PATTERNS) { + char *p; + long v = getval(av[i], &p); + if (*p == 'x' || *p == 'X') { + test[patterns].count = v; + test[patterns].size = getval(p + 1, &p); + } else { + test[patterns].count = 1; + test[patterns].size = v; + } + if (*p) { + fprintf(stderr, "ignoring bad pattern: %s\n", av[i]); + } else { + if (test[patterns].count > maxcnt) + maxcnt = test[patterns].count; + total += test[patterns].count * test[patterns].size; + patterns++; + } + } else if (!file) { + file = av[i]; + } else { + patterns = 0; + break; + } + } + if (patterns == 0) { + fprintf(stderr, "usage: %s [file] pattern...\n", av[0]); + exit(1); + } + if (shm_init(file) < 0) { + perror("shm_init"); + exit(1); + } + suffix = 0; + while (total > 2000 && suffix < 4) { + suffix++; + total = (total + 512) / 1024; + } + printf("Attempting %d patterns for %ld%c total\n", 
patterns, total, + " KMGT"[suffix]); + total = 0; + for (i = 0; i < maxcnt; i++) + for (j = 0; j < patterns; j++) + if (i < test[j].count) { + *tail = shm_malloc(test[j].size); + if (*tail) { + total += test[j].size; + tail = &(*tail)->next; + } else { + suffix = 0; + while (total > 2000 && suffix < 4) { + suffix++; + total = (total + 512) / 1024; + } + printf("Alloc failed after %ld%c\n", total, " KMGT"[suffix]); + i = maxcnt; + break; + } + } + *tail = 0; + printf("Done with allocation, now freeing\n"); + count = 0; + for (l = head; l; l = next) { + next = l->next; + if (next) { + l->next = next->next; + shm_free(next); + if (++count == 1000) { + putchar('.'); + fflush(stdout); + count = 0; + } + next = l->next; + } + } + printf("\nFreed half, now freeing remainder\n"); + count = 0; + for (l = head; l; l = next) { + next = l->next; + shm_free(l); + if (++count == 1000) { + putchar('.'); + fflush(stdout); + count = 0; + } + } + printf("\n"); + return 0; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..ae6441d --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,10 @@ +add_executable(test test.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_link_libraries(test shm pthread rt) + +add_executable(test_client test.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_compile_definitions(test_client PUBLIC USE_CLIENT) +target_link_libraries(test_client shm pthread rt) + +add_executable(test_spatter test_spatter.c parse-args.c json.c sp_alloc.c pcg_basic.c backend-support-tests.c ${CLIENT_SOURCE_FILES} ${SHARED_SOURCE_FILES}) +target_compile_definitions(test_spatter PUBLIC USE_SERIAL) +target_link_libraries(test_spatter shm pthread rt argtable3) diff --git a/tests/backend-support-tests.c b/tests/backend-support-tests.c new file mode 100644 index 0000000..c4bf540 --- /dev/null +++ b/tests/backend-support-tests.c @@ -0,0 +1,75 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. 
+This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +*/ + +#include "backend-support-tests.h" + +int sg_cuda_support() { +#if defined USE_CUDA + return 1; +#else + return 0; +#endif +} + +int sg_opencl_support() { +#if defined USE_OPENCL + return 1; +#else + return 0; +#endif +} + +int sg_openmp_support() { +#if defined USE_OPENMP + return 1; +#else + return 0; +#endif +} + +int sg_serial_support() { +#if defined USE_SERIAL + return 1; +#else + return 0; +#endif +} diff --git a/tests/json.c b/tests/json.c new file mode 100644 index 0000000..679120d --- /dev/null +++ b/tests/json.c @@ -0,0 +1,940 @@ +/* vim: set et ts=3 sw=3 sts=3 ft=c: + * + * Copyright (C) 2012, 2013, 2014 James McLaughlin et al. All rights reserved. + * https://github.com/udp/json-parser + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "json.h" + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif + +const struct _json_value json_value_none; + +#include +#include +#include +#include + +typedef unsigned int json_uchar; + +static unsigned char hex_value(json_char c) { + if (isdigit(c)) + return c - '0'; + + switch (c) { + case 'a': + case 'A': + return 0x0A; + case 'b': + case 'B': + return 0x0B; + case 'c': + case 'C': + return 0x0C; + case 'd': + case 'D': + return 0x0D; + case 'e': + case 'E': + return 0x0E; + case 'f': + case 'F': + return 0x0F; + default: + return 0xFF; + } +} + +typedef struct { + unsigned long used_memory; + + unsigned int uint_max; + unsigned long ulong_max; + + json_settings settings; + int first_pass; + + const json_char *ptr; + unsigned int cur_line, cur_col; + +} json_state; + +static void *default_alloc(size_t size, int zero, void *user_data) { + return zero ? 
calloc(1, size) : malloc(size); +} + +static void default_free(void *ptr, void *user_data) { free(ptr); } + +static void *json_alloc(json_state *state, unsigned long size, int zero) { + if ((state->ulong_max - state->used_memory) < size) + return 0; + + if (state->settings.max_memory && + (state->used_memory += size) > state->settings.max_memory) { + return 0; + } + + return state->settings.mem_alloc(size, zero, state->settings.user_data); +} + +static int new_value(json_state *state, json_value **top, json_value **root, + json_value **alloc, json_type type) { + json_value *value; + int values_size; + + if (!state->first_pass) { + value = *top = *alloc; + *alloc = (*alloc)->_reserved.next_alloc; + + if (!*root) + *root = value; + + switch (value->type) { + case json_array: + + if (value->u.array.length == 0) + break; + + if (!(value->u.array.values = (json_value **)json_alloc( + state, value->u.array.length * sizeof(json_value *), 0))) { + return 0; + } + + value->u.array.length = 0; + break; + + case json_object: + + if (value->u.object.length == 0) + break; + + values_size = sizeof(*value->u.object.values) * value->u.object.length; + + if (!(value->u.object.values = (json_object_entry *)json_alloc( + state, values_size + ((unsigned long)value->u.object.values), + 0))) { + return 0; + } + + value->_reserved.object_mem = + (*(char **)&value->u.object.values) + values_size; + + value->u.object.length = 0; + break; + + case json_string: + + if (!(value->u.string.ptr = (json_char *)json_alloc( + state, (value->u.string.length + 1) * sizeof(json_char), 0))) { + return 0; + } + + value->u.string.length = 0; + break; + + default: + break; + }; + + return 1; + } + + if (!(value = (json_value *)json_alloc( + state, sizeof(json_value) + state->settings.value_extra, 1))) { + return 0; + } + + if (!*root) + *root = value; + + value->type = type; + value->parent = *top; + +#ifdef JSON_TRACK_SOURCE + value->line = state->cur_line; + value->col = state->cur_col; +#endif + + if 
(*alloc) + (*alloc)->_reserved.next_alloc = value; + + *alloc = *top = value; + + return 1; +} + +#define whitespace \ + case '\n': \ + ++state.cur_line; \ + state.cur_col = 0; \ + case ' ': \ + case '\t': \ + case '\r' + +#define string_add(b) \ + do { \ + if (!state.first_pass) \ + string[string_length] = b; \ + ++string_length; \ + } while (0); + +#define line_and_col state.cur_line, state.cur_col + +static const long flag_next = 1 << 0, flag_reproc = 1 << 1, + flag_need_comma = 1 << 2, flag_seek_value = 1 << 3, + flag_escaped = 1 << 4, flag_string = 1 << 5, + flag_need_colon = 1 << 6, flag_done = 1 << 7, + flag_num_negative = 1 << 8, flag_num_zero = 1 << 9, + flag_num_e = 1 << 10, flag_num_e_got_sign = 1 << 11, + flag_num_e_negative = 1 << 12, flag_line_comment = 1 << 13, + flag_block_comment = 1 << 14; + +json_value *json_parse_ex(json_settings *settings, const json_char *json, + size_t length, char *error_buf) { + json_char error[json_error_max]; + const json_char *end; + json_value *top, *root, *alloc = 0; + json_state state = {0}; + long flags; + long num_digits = 0, num_e = 0; + json_int_t num_fraction = 0; + + /* Skip UTF-8 BOM + */ + if (length >= 3 && ((unsigned char)json[0]) == 0xEF && + ((unsigned char)json[1]) == 0xBB && ((unsigned char)json[2]) == 0xBF) { + json += 3; + length -= 3; + } + + error[0] = '\0'; + end = (json + length); + + memcpy(&state.settings, settings, sizeof(json_settings)); + + if (!state.settings.mem_alloc) + state.settings.mem_alloc = default_alloc; + + if (!state.settings.mem_free) + state.settings.mem_free = default_free; + + memset(&state.uint_max, 0xFF, sizeof(state.uint_max)); + memset(&state.ulong_max, 0xFF, sizeof(state.ulong_max)); + + state.uint_max -= 8; /* limit of how much can be added before next check */ + state.ulong_max -= 8; + + for (state.first_pass = 1; state.first_pass >= 0; --state.first_pass) { + json_uchar uchar; + unsigned char uc_b1, uc_b2, uc_b3, uc_b4; + json_char *string = 0; + unsigned int 
string_length = 0; + + top = root = 0; + flags = flag_seek_value; + + state.cur_line = 1; + + for (state.ptr = json;; ++state.ptr) { + json_char b = (state.ptr == end ? 0 : *state.ptr); + + if (flags & flag_string) { + if (!b) { + sprintf(error, "Unexpected EOF in string (at %d:%d)", line_and_col); + goto e_failed; + } + + if (string_length > state.uint_max) + goto e_overflow; + + if (flags & flag_escaped) { + flags &= ~flag_escaped; + + switch (b) { + case 'b': + string_add('\b'); + break; + case 'f': + string_add('\f'); + break; + case 'n': + string_add('\n'); + break; + case 'r': + string_add('\r'); + break; + case 't': + string_add('\t'); + break; + case 'u': + + if (end - state.ptr <= 4 || + (uc_b1 = hex_value(*++state.ptr)) == 0xFF || + (uc_b2 = hex_value(*++state.ptr)) == 0xFF || + (uc_b3 = hex_value(*++state.ptr)) == 0xFF || + (uc_b4 = hex_value(*++state.ptr)) == 0xFF) { + sprintf(error, "Invalid character value `%c` (at %d:%d)", b, + line_and_col); + goto e_failed; + } + + uc_b1 = (uc_b1 << 4) | uc_b2; + uc_b2 = (uc_b3 << 4) | uc_b4; + uchar = (uc_b1 << 8) | uc_b2; + + if ((uchar & 0xF800) == 0xD800) { + json_uchar uchar2; + + if (end - state.ptr <= 6 || (*++state.ptr) != '\\' || + (*++state.ptr) != 'u' || + (uc_b1 = hex_value(*++state.ptr)) == 0xFF || + (uc_b2 = hex_value(*++state.ptr)) == 0xFF || + (uc_b3 = hex_value(*++state.ptr)) == 0xFF || + (uc_b4 = hex_value(*++state.ptr)) == 0xFF) { + sprintf(error, "Invalid character value `%c` (at %d:%d)", b, + line_and_col); + goto e_failed; + } + + uc_b1 = (uc_b1 << 4) | uc_b2; + uc_b2 = (uc_b3 << 4) | uc_b4; + uchar2 = (uc_b1 << 8) | uc_b2; + + uchar = 0x010000 | ((uchar & 0x3FF) << 10) | (uchar2 & 0x3FF); + } + + if (sizeof(json_char) >= sizeof(json_uchar) || (uchar <= 0x7F)) { + string_add((json_char)uchar); + break; + } + + if (uchar <= 0x7FF) { + if (state.first_pass) + string_length += 2; + else { + string[string_length++] = 0xC0 | (uchar >> 6); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + 
break; + } + + if (uchar <= 0xFFFF) { + if (state.first_pass) + string_length += 3; + else { + string[string_length++] = 0xE0 | (uchar >> 12); + string[string_length++] = 0x80 | ((uchar >> 6) & 0x3F); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + break; + } + + if (state.first_pass) + string_length += 4; + else { + string[string_length++] = 0xF0 | (uchar >> 18); + string[string_length++] = 0x80 | ((uchar >> 12) & 0x3F); + string[string_length++] = 0x80 | ((uchar >> 6) & 0x3F); + string[string_length++] = 0x80 | (uchar & 0x3F); + } + + break; + + default: + string_add(b); + }; + + continue; + } + + if (b == '\\') { + flags |= flag_escaped; + continue; + } + + if (b == '"') { + if (!state.first_pass) + string[string_length] = 0; + + flags &= ~flag_string; + string = 0; + + switch (top->type) { + case json_string: + + top->u.string.length = string_length; + flags |= flag_next; + + break; + + case json_object: + + if (state.first_pass) + (*(json_char **)&top->u.object.values) += string_length + 1; + else { + top->u.object.values[top->u.object.length].name = + (json_char *)top->_reserved.object_mem; + + top->u.object.values[top->u.object.length].name_length = + string_length; + + (*(json_char **)&top->_reserved.object_mem) += string_length + 1; + } + + flags |= flag_seek_value | flag_need_colon; + continue; + + default: + break; + }; + } else { + string_add(b); + continue; + } + } + + if (state.settings.settings & json_enable_comments) { + if (flags & (flag_line_comment | flag_block_comment)) { + if (flags & flag_line_comment) { + if (b == '\r' || b == '\n' || !b) { + flags &= ~flag_line_comment; + --state.ptr; /* so null can be reproc'd */ + } + + continue; + } + + if (flags & flag_block_comment) { + if (!b) { + sprintf(error, "%d:%d: Unexpected EOF in block comment", + line_and_col); + goto e_failed; + } + + if (b == '*' && state.ptr < (end - 1) && state.ptr[1] == '/') { + flags &= ~flag_block_comment; + ++state.ptr; /* skip closing sequence */ + } + + 
continue; + } + } else if (b == '/') { + if (!(flags & (flag_seek_value | flag_done)) && + top->type != json_object) { + sprintf(error, "%d:%d: Comment not allowed here", line_and_col); + goto e_failed; + } + + if (++state.ptr == end) { + sprintf(error, "%d:%d: EOF unexpected", line_and_col); + goto e_failed; + } + + switch (b = *state.ptr) { + case '/': + flags |= flag_line_comment; + continue; + + case '*': + flags |= flag_block_comment; + continue; + + default: + sprintf(error, "%d:%d: Unexpected `%c` in comment opening sequence", + line_and_col, b); + goto e_failed; + }; + } + } + + if (flags & flag_done) { + if (!b) + break; + + switch (b) { + whitespace: + continue; + + default: + + sprintf(error, "%d:%d: Trailing garbage: `%c`", state.cur_line, + state.cur_col, b); + + goto e_failed; + }; + } + + if (flags & flag_seek_value) { + switch (b) { + whitespace: + continue; + + case ']': + + if (top && top->type == json_array) + flags = (flags & ~(flag_need_comma | flag_seek_value)) | flag_next; + else { + sprintf(error, "%d:%d: Unexpected ]", line_and_col); + goto e_failed; + } + + break; + + default: + + if (flags & flag_need_comma) { + if (b == ',') { + flags &= ~flag_need_comma; + continue; + } else { + sprintf(error, "%d:%d: Expected , before %c", state.cur_line, + state.cur_col, b); + + goto e_failed; + } + } + + if (flags & flag_need_colon) { + if (b == ':') { + flags &= ~flag_need_colon; + continue; + } else { + sprintf(error, "%d:%d: Expected : before %c", state.cur_line, + state.cur_col, b); + + goto e_failed; + } + } + + flags &= ~flag_seek_value; + + switch (b) { + case '{': + + if (!new_value(&state, &top, &root, &alloc, json_object)) + goto e_alloc_failure; + + continue; + + case '[': + + if (!new_value(&state, &top, &root, &alloc, json_array)) + goto e_alloc_failure; + + flags |= flag_seek_value; + continue; + + case '"': + + if (!new_value(&state, &top, &root, &alloc, json_string)) + goto e_alloc_failure; + + flags |= flag_string; + + string = 
top->u.string.ptr; + string_length = 0; + + continue; + + case 't': + + if ((end - state.ptr) < 3 || *(++state.ptr) != 'r' || + *(++state.ptr) != 'u' || *(++state.ptr) != 'e') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_boolean)) + goto e_alloc_failure; + + top->u.boolean = 1; + + flags |= flag_next; + break; + + case 'f': + + if ((end - state.ptr) < 4 || *(++state.ptr) != 'a' || + *(++state.ptr) != 'l' || *(++state.ptr) != 's' || + *(++state.ptr) != 'e') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_boolean)) + goto e_alloc_failure; + + flags |= flag_next; + break; + + case 'n': + + if ((end - state.ptr) < 3 || *(++state.ptr) != 'u' || + *(++state.ptr) != 'l' || *(++state.ptr) != 'l') { + goto e_unknown_value; + } + + if (!new_value(&state, &top, &root, &alloc, json_null)) + goto e_alloc_failure; + + flags |= flag_next; + break; + + default: + + if (isdigit(b) || b == '-') { + if (!new_value(&state, &top, &root, &alloc, json_integer)) + goto e_alloc_failure; + + if (!state.first_pass) { + while (isdigit(b) || b == '+' || b == '-' || b == 'e' || + b == 'E' || b == '.') { + if ((++state.ptr) == end) { + b = 0; + break; + } + + b = *state.ptr; + } + + flags |= flag_next | flag_reproc; + break; + } + + flags &= ~(flag_num_negative | flag_num_e | flag_num_e_got_sign | + flag_num_e_negative | flag_num_zero); + + num_digits = 0; + num_fraction = 0; + num_e = 0; + + if (b != '-') { + flags |= flag_reproc; + break; + } + + flags |= flag_num_negative; + continue; + } else { + sprintf(error, "%d:%d: Unexpected %c when seeking value", + line_and_col, b); + goto e_failed; + } + }; + }; + } else { + switch (top->type) { + case json_object: + + switch (b) { + whitespace: + continue; + + case '"': + + if (flags & flag_need_comma) { + sprintf(error, "%d:%d: Expected , before \"", line_and_col); + goto e_failed; + } + + flags |= flag_string; + + string = (json_char *)top->_reserved.object_mem; + 
string_length = 0; + + break; + + case '}': + + flags = (flags & ~flag_need_comma) | flag_next; + break; + + case ',': + + if (flags & flag_need_comma) { + flags &= ~flag_need_comma; + break; + } + + default: + sprintf(error, "%d:%d: Unexpected `%c` in object", line_and_col, b); + goto e_failed; + }; + + break; + + case json_integer: + case json_double: + + if (isdigit(b)) { + ++num_digits; + + if (top->type == json_integer || flags & flag_num_e) { + if (!(flags & flag_num_e)) { + if (flags & flag_num_zero) { + sprintf(error, "%d:%d: Unexpected `0` before `%c`", + line_and_col, b); + goto e_failed; + } + + if (num_digits == 1 && b == '0') + flags |= flag_num_zero; + } else { + flags |= flag_num_e_got_sign; + num_e = (num_e * 10) + (b - '0'); + continue; + } + + top->u.integer = (top->u.integer * 10) + (b - '0'); + continue; + } + + num_fraction = (num_fraction * 10) + (b - '0'); + continue; + } + + if (b == '+' || b == '-') { + if ((flags & flag_num_e) && !(flags & flag_num_e_got_sign)) { + flags |= flag_num_e_got_sign; + + if (b == '-') + flags |= flag_num_e_negative; + + continue; + } + } else if (b == '.' 
&& top->type == json_integer) { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit before `.`", line_and_col); + goto e_failed; + } + + top->type = json_double; + top->u.dbl = (double)top->u.integer; + + num_digits = 0; + continue; + } + + if (!(flags & flag_num_e)) { + if (top->type == json_double) { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit after `.`", line_and_col); + goto e_failed; + } + + top->u.dbl += + ((double)num_fraction) / (pow(10.0, (double)num_digits)); + } + + if (b == 'e' || b == 'E') { + flags |= flag_num_e; + + if (top->type == json_integer) { + top->type = json_double; + top->u.dbl = (double)top->u.integer; + } + + num_digits = 0; + flags &= ~flag_num_zero; + + continue; + } + } else { + if (!num_digits) { + sprintf(error, "%d:%d: Expected digit after `e`", line_and_col); + goto e_failed; + } + + top->u.dbl *= pow( + 10.0, (double)(flags & flag_num_e_negative ? -num_e : num_e)); + } + + if (flags & flag_num_negative) { + if (top->type == json_integer) + top->u.integer = -top->u.integer; + else + top->u.dbl = -top->u.dbl; + } + + flags |= flag_next | flag_reproc; + break; + + default: + break; + }; + } + + if (flags & flag_reproc) { + flags &= ~flag_reproc; + --state.ptr; + } + + if (flags & flag_next) { + flags = (flags & ~flag_next) | flag_need_comma; + + if (!top->parent) { + /* root value done */ + + flags |= flag_done; + continue; + } + + if (top->parent->type == json_array) + flags |= flag_seek_value; + + if (!state.first_pass) { + json_value *parent = top->parent; + + switch (parent->type) { + case json_object: + + parent->u.object.values[parent->u.object.length].value = top; + + break; + + case json_array: + + parent->u.array.values[parent->u.array.length] = top; + + break; + + default: + break; + }; + } + + if ((++top->parent->u.array.length) > state.uint_max) + goto e_overflow; + + top = top->parent; + + continue; + } + } + + alloc = root; + } + + return root; + +e_unknown_value: + + sprintf(error, "%d:%d: 
Unknown value", line_and_col);
  goto e_failed;

e_alloc_failure:

  strcpy(error, "Memory allocation failure");
  goto e_failed;

e_overflow:

  sprintf(error, "%d:%d: Too long (caught overflow)", line_and_col);
  goto e_failed;

e_failed:

  /* Report the error to the caller's buffer when one was provided. */
  if (error_buf) {
    if (*error)
      strcpy(error_buf, error);
    else
      strcpy(error_buf, "Unknown error");
  }

  /* On the first (sizing) pass values are chained through
     _reserved.next_alloc, so walk and free that list directly. */
  if (state.first_pass)
    alloc = root;

  while (alloc) {
    top = alloc->_reserved.next_alloc;
    state.settings.mem_free(alloc, state.settings.user_data);
    alloc = top;
  }

  /* After the second pass the tree is fully linked; free it structurally. */
  if (!state.first_pass)
    json_value_free_ex(&state.settings, root);

  return 0;
}

/* Convenience wrapper: parse with default settings and no error buffer. */
json_value *json_parse(const json_char *json, size_t length) {
  json_settings settings = {0};
  return json_parse_ex(&settings, json, length, 0);
}

/* Free a parsed value tree without recursion: descend by popping the last
   child of each container (decrementing its length as the loop progresses),
   and climb back up via the parent pointers, freeing each node after its
   children are gone.  All releases go through settings->mem_free. */
void json_value_free_ex(json_settings *settings, json_value *value) {
  json_value *cur_value;

  if (!value)
    return;

  /* Detach from any parent so the upward walk terminates here. */
  value->parent = 0;

  while (value) {
    switch (value->type) {
    case json_array:

      /* No children left: free the (now empty) values array itself. */
      if (!value->u.array.length) {
        settings->mem_free(value->u.array.values, settings->user_data);
        break;
      }

      /* Descend into the last remaining child. */
      value = value->u.array.values[--value->u.array.length];
      continue;

    case json_object:

      if (!value->u.object.length) {
        settings->mem_free(value->u.object.values, settings->user_data);
        break;
      }

      value = value->u.object.values[--value->u.object.length].value;
      continue;

    case json_string:

      settings->mem_free(value->u.string.ptr, settings->user_data);
      break;

    default:
      break;
    };

    /* This node is fully drained: free it and climb to its parent. */
    cur_value = value;
    value = value->parent;
    settings->mem_free(cur_value, settings->user_data);
  }
}

/* Convenience wrapper around json_value_free_ex using the default
   deallocator. */
void json_value_free(json_value *value) {
  json_settings settings = {0};
  settings.mem_free = default_free;
  json_value_free_ex(&settings, value);
}
diff --git a/tests/parse-args.c b/tests/parse-args.c
new file mode 100644
index 0000000..539ef88
--- /dev/null
+++ b/tests/parse-args.c
@@ -0,0 +1,1447 @@
/*
© (or copyright) 2022. Triad National Security, LLC.
All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +*/ + +#include "parse-args.h" +#include "argtable3.h" +#include "backend-support-tests.h" +#include "json.h" +#include "pcg_basic.h" +#include "sp_alloc.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#include "../src/cuda/cuda-backend.h" +#endif + +#ifdef USE_OPENMP +#include +#endif + +#ifdef USE_PAPI +#include "papi_helper.h" +int papi_nevents; +char papi_event_names[PAPI_MAX_COUNTERS][STRING_SIZE]; +#endif + +#define INTERACTIVE "INTERACTIVE" + +char platform_string[STRING_SIZE]; +char device_string[STRING_SIZE]; +char kernel_file[STRING_SIZE]; +char kernel_name[STRING_SIZE]; +char jsonfilename[STRING_SIZE]; +char op_string[STRING_SIZE]; + +int cuda_dev = -1; +int validate_flag = 0; +int quiet_flag = 0; +int aggregate_flag = 1; +int compress_flag = 0; +int stride_kernel = -1; + +enum sg_backend backend = INVALID_BACKEND; + +// These should actually stay global +int verbose; +FILE *err_file; + +void safestrcopy(char *dest, const char *src); +void parse_p(char *, struct run_config *, int mode); +ssize_t setincludes(size_t key, size_t *set, size_t set_len); +void xkp_pattern(size_t *pat, size_t dim); +void parse_backend(int argc, char **argv); + +void **argtable; +unsigned int number_of_arguments = 35; +struct arg_lit *verb, *help, *interactive, *validate, *aggregate, *compress; +struct arg_str *backend_arg, *cl_platform, *cl_device, *pattern, + *pattern_gather, *pattern_scatter, *kernelName, *delta, *delta_gather, + *delta_scatter, *name, *papi, *op; +struct arg_int *count, *wrap, *runs, *omp_threads, *vector_len, + 
*local_work_size, *shared_memory, *morton, *hilbert, *roblock, *stride, + *random_arg, *no_print_header; +struct arg_file *kernelFile; +struct arg_end *end; + +void initialize_argtable() { + // Initialize the argtable on the stack just because it is easier and how the + // documentation handles it + void **malloc_argtable = + (void **)malloc(sizeof(void *) * number_of_arguments); + + // Arguments that do not take parameters + malloc_argtable[0] = help = arg_litn( + NULL, "help", 0, 1, "Displays info about commands and then exits."); + malloc_argtable[1] = verb = arg_litn( + NULL, "verbose", 0, 1, + "Print info about default arguments that you have not overridden."); + malloc_argtable[2] = no_print_header = arg_intn( + "q", "no-print-header", "", 0, 1, "Do not print header information."); + malloc_argtable[3] = interactive = + arg_litn("i", "interactive", 0, 1, + "Pick the platform and the device interactively."); + malloc_argtable[4] = validate = + arg_litn(NULL, "validate", 0, 1, + "Perform extra validation checks to ensure data validity"); + malloc_argtable[5] = aggregate = + arg_litn("a", "aggregate", 0, 1, + "Report a minimum time for all runs of a given configuration " + "for 2 or more runs. [Default 1] (Do not use with PAPI)"); + malloc_argtable[6] = compress = arg_litn("c", "compress", 0, 1, "TODO"); + // Benchmark Configuration + malloc_argtable[7] = pattern = arg_strn( + "p", "pattern", "", 0, 1, + "Specify either a built-in pattern (i.e. UNIFORM), a custom pattern " + "(i.e. 1,2,3,4), or a path to a json file with a run-configuration."); + malloc_argtable[8] = pattern_gather = + arg_strn("g", "pattern-gather", "", 0, 1, + "Valid wtih [kernel-name: GS]. Specify either a built-in " + "pattern (i.e. UNIFORM), a custom pattern (i.e. 1,2,3,4), or a " + "path to a json file with a run-configuration."); + malloc_argtable[9] = pattern_scatter = + arg_strn("h", "pattern-scatter", "", 0, 1, + "Valid with [kernel-name: GS]. 
Specify either a built-in " + "pattern (i.e. UNIFORM), a custom pattern (i.e. 1,2,3,4), or a " + "path to a json file with a run-configuration."); + malloc_argtable[10] = kernelName = + arg_strn("k", "kernel-name", "", 0, 1, + "Specify the kernel you want to run. [Default: Gather]"); + malloc_argtable[11] = op = arg_strn("o", "op", "", 0, 1, "TODO"); + malloc_argtable[12] = delta = + arg_strn("d", "delta", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[13] = delta_gather = + arg_strn("x", "delta gather", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[14] = delta_scatter = + arg_strn("y", "delta scatter", "", 0, 1, + "Specify one or more deltas. [Default: 8]"); + malloc_argtable[15] = count = arg_intn( + "l", "count", "", 0, 1, "Number of Gathers or Scatters to perform."); + malloc_argtable[16] = wrap = + arg_intn("w", "wrap", "", 0, 1, + "Number of independent slots in the small buffer (source buffer " + "if Scatter, Target buffer if Gather. [Default: 1]"); + malloc_argtable[17] = runs = arg_intn( + "R", "runs", "", 0, 1, + "Number of times to repeat execution of the kernel. [Default: 10]"); + malloc_argtable[18] = omp_threads = + arg_intn("t", "omp-threads", "", 0, 1, + "Number of OpenMP threads. 
[Default: OMP_MAX_THREADS]"); + malloc_argtable[19] = vector_len = + arg_intn("v", "vector-len", "", 0, 1, "TODO"); + malloc_argtable[20] = local_work_size = arg_intn( + "z", "local-work-size", "", 0, 1, + "Numer of Gathers or Scatters performed by each thread on a GPU."); + malloc_argtable[21] = shared_memory = + arg_intn("m", "shared-memory", "", 0, 1, + "Amount of dummy shared memory to allocate on GPUs (used for " + "occupancy control)."); + malloc_argtable[22] = name = + arg_strn("n", "name", "", 0, 1, + "Specify and name this configuration in the output."); + malloc_argtable[23] = random_arg = + arg_intn("s", "random", "", 0, 1, + "Sets the seed, or uses a random one if no seed is specified."); + malloc_argtable[24] = backend_arg = + arg_strn("b", "backend", "", 0, 1, + "Specify a backend: OpenCL, OpenMP, CUDA, or Serial."); + malloc_argtable[25] = cl_platform = arg_strn( + NULL, "cl-platform", "", 0, 1, + "Specify platform if using OpenCL (case-insensitive, fuzzy matching)."); + malloc_argtable[26] = cl_device = arg_strn( + NULL, "cl-device", "", 0, 1, + "Specify device if using OpenCL (case-insensitive, fuzzy matching)."); + malloc_argtable[27] = kernelFile = + arg_filen("f", "kernel-file", "", 0, 1, + "Specify the location of an OpenCL kernel file."); + // Other Configurations + malloc_argtable[28] = morton = arg_intn(NULL, "morton", "", 0, 1, "TODO"); + malloc_argtable[29] = hilbert = + arg_intn(NULL, "hilbert", "", 0, 1, "TODO"); + malloc_argtable[30] = roblock = + arg_intn(NULL, "roblock", "", 0, 1, "TODO"); + malloc_argtable[31] = stride = arg_intn(NULL, "stride", "", 0, 1, "TODO"); + malloc_argtable[32] = papi = arg_strn(NULL, "papi", "", 0, 1, "TODO"); + malloc_argtable[33] = end = arg_end(20); + + // Random has an option to provide an argument. Default its value to -1. 
+ random_arg->hdr.flag |= ARG_HASOPTVALUE; + random_arg->ival[0] = -1; + + // Set default values + kernelName->sval[0] = "Gather\0"; + delta->sval[0] = "8\0"; + delta_gather->sval[0] = "8\0"; + delta_scatter->sval[0] = "8\0"; + wrap->ival[0] = 1; + runs->ival[0] = 10; + + // Set the global argtable equal to the malloc argtable + argtable = malloc_argtable; +} + +void copy_str_ignore_leading_space(char *dest, const char *source) { + if (source[0] == ' ') + safestrcopy(dest, &source[1]); + else + safestrcopy(dest, source); +} + +int get_num_configs(json_value *value) { + if (value->type != json_array) { + error("get_num_configs was not passed an array", ERROR); + } + + return value->u.array.length; +} + +void parse_json_kernel(json_object_entry cur, char **argv, int i) { + if (!strcasecmp(cur.value->u.string.ptr, "SCATTER") || + !strcasecmp(cur.value->u.string.ptr, "GATHER") || + !strcasecmp(cur.value->u.string.ptr, "GS")) { + error("Ambiguous Kernel Type: Assuming kernel-name option.", WARN); + snprintf(argv[i + 1], STRING_SIZE, "--kernel-name=%s", + cur.value->u.string.ptr); + } else { + error("Ambigous Kernel Type: Assuming kernel-file option.", WARN); + snprintf(argv[i + 1], STRING_SIZE, "--kernel-file=%s", + cur.value->u.string.ptr); + } +} + +void parse_json_array(json_object_entry cur, char **argv, int i) { + int index = 0; + index += snprintf(argv[i + 1], STRING_SIZE, "--%s=", cur.name); + printf("argv[%d]: %s\n", i + 1, argv[i + 1]); + + for (int j = 0; j < cur.value->u.array.length; j++) { + if (cur.value->u.array.values[j]->type != json_integer) { + error("Encountered non-integer json type while parsing array", ERROR); + } + + char buffer[STRING_SIZE]; + int check = snprintf(buffer, STRING_SIZE, "%zd", + cur.value->u.array.values[j]->u.integer); + int added = snprintf(buffer, STRING_SIZE - index, "%zd", + cur.value->u.array.values[j]->u.integer); + + if (check == added) { + index += snprintf(&argv[i + 1][index], STRING_SIZE - index, "%zd", + 
cur.value->u.array.values[j]->u.integer); + + if (index >= STRING_SIZE - 1) { + break; + } else if (j != cur.value->u.array.length - 1 && + index < STRING_SIZE - 1) { + index += snprintf(&argv[i + 1][index], STRING_SIZE - index, ","); + } + + } else { + index--; + argv[i + 1][index] = '\0'; + break; + } + } +} + +struct run_config *parse_json_config(json_value *value) { + + struct run_config *rc = + (struct run_config *)calloc(1, sizeof(struct run_config)); + + if (!value) + error("parse_json_config passed NULL pointer", ERROR); + + if (value->type != json_object) + error("parse_json_config should only be passed json_objects", ERROR); + + int argc = value->u.object.length + 1; + char **argv = (char **)sp_malloc(sizeof(char *), argc * 2, ALIGN_CACHE); + + for (int i = 0; i < argc; i++) + argv[i] = (char *)sp_malloc(1, STRING_SIZE * 2, ALIGN_CACHE); + + for (int i = 0; i < argc - 1; i++) { + json_object_entry cur = value->u.object.values[i]; + + if (cur.value->type == json_string) { + if (!strcasecmp(cur.name, "kernel")) { + parse_json_kernel(cur, argv, i); + } else { + snprintf(argv[i + 1], STRING_SIZE, "--%s=%s", cur.name, + cur.value->u.string.ptr); + } + } else if (cur.value->type == json_integer) { + snprintf(argv[i + 1], STRING_SIZE, "--%s=%zd", cur.name, + cur.value->u.integer); + } else if (cur.value->type == json_array) { + parse_json_array(cur, argv, i); + } else { + error("Unexpected json type", ERROR); + } + } + + // yeah its hacky - parse_args ignores the first arg + safestrcopy(argv[0], argv[1]); + + int nerrors = arg_parse(argc, argv, argtable); + + if (nerrors > 0) { + arg_print_errors(stdout, end, "Spatter"); + printf("Error while parsing json file.\n"); + exit(0); + } + + rc = parse_runs(argc, argv); + + for (int i = 0; i < argc; i++) + free(argv[i]); + + free(argv); + + return rc; +} + +void parse_args(int argc, char **argv, int *nrc, struct run_config **rc) { + initialize_argtable(); + int nerrors = arg_parse(argc, argv, argtable); + + if 
(help->count > 0) { + printf("Usage:\n"); + arg_print_syntax(stdout, argtable, "\n"); + arg_print_glossary(stdout, argtable, " %-28s %s\n"); + exit(0); + } + + if (nerrors > 0) { + arg_print_errors(stdout, end, "Spatter"); + printf("Try './spatter --help' for more information.\n"); + exit(0); + } + + parse_backend(argc, argv); + + // Parse command-line arguments to in case of specified json file. + int json = 0; + + if (pattern->count > 0) { + if (strstr(pattern->sval[0], "FILE")) { + safestrcopy(jsonfilename, strchr(pattern->sval[0], '=') + 1); + printf("Reading patterns from %s.\n", jsonfilename); + json = 1; + } + } + + if (json) { + FILE *fp; + struct stat filestatus; + int file_size; + char *file_contents; + json_char *json; + json_value *value; + + if (stat(jsonfilename, &filestatus) != 0) + error("Json file not found", ERROR); + + file_size = filestatus.st_size; + file_contents = (char *)sp_malloc(file_size, 1 + 1, ALIGN_CACHE); + + fp = fopen(jsonfilename, "rt"); + if (!fp) + error("Unable to open Json file", ERROR); + + if (fread(file_contents, file_size, 1, fp) != 1) { + fclose(fp); + error("Unable to read content of Json file", ERROR); + } + fclose(fp); + + json = (json_char *)file_contents; + value = json_parse(json, file_size); + + if (!value) + error("Unable to parse Json file", ERROR); + + // This is the number of specified runs in the json file. 
+ *nrc = get_num_configs(value); + + *rc = (struct run_config *)sp_calloc(sizeof(struct run_config), *nrc, + ALIGN_CACHE); + + for (int i = 0; i < *nrc; i++) { + struct run_config *rctemp = parse_json_config(value->u.array.values[i]); + rc[0][i] = *rctemp; + free(rctemp); + } + + json_value_free(value); + free(file_contents); + } else { + *rc = (struct run_config *)sp_calloc(sizeof(struct run_config), 1, + ALIGN_CACHE); + rc[0][0] = *parse_runs(argc, argv); + *nrc = 1; + } + + free(argtable); + + return; +} + +struct run_config *parse_runs(int argc, char **argv) { + int pattern_found = 0; + int pattern_scatter_found = 0; + int pattern_gather_found = 0; + + struct run_config *rc = + (struct run_config *)calloc(1, sizeof(struct run_config)); + rc->delta = -1; + rc->delta_gather = -1; + rc->delta_scatter = -1; + rc->stride_kernel = -1; + rc->ro_block = 1; + rc->ro_order = NULL; +#ifdef USE_OPENMP + rc->omp_threads = omp_get_max_threads(); +#else + rc->omp_threads = 1; +#endif + rc->kernel = INVALID_KERNEL; + safestrcopy(rc->name, "NONE"); + + if (kernelName->count > 0) { + copy_str_ignore_leading_space(kernel_name, kernelName->sval[0]); + if (!strcasecmp("GS", kernel_name)) + rc->kernel = GS; + else if (!strcasecmp("SCATTER", kernel_name)) + rc->kernel = SCATTER; + else if (!strcasecmp("GATHER", kernel_name)) + rc->kernel = GATHER; + else { + char output[STRING_SIZE]; + sprintf(output, "Invalid kernel %s\n", kernel_name); + error(output, ERROR); + } + } + + if (op->count > 0) { + copy_str_ignore_leading_space(op_string, op->sval[0]); + if (!strcasecmp("COPY", op_string)) + rc->op = OP_COPY; + else if (!strcasecmp("ACCUM", op_string)) + rc->op = OP_ACCUM; + else + error("Unrecognzied op type", ERROR); + } + + if (random_arg->count > 0) { + // Parsing the seed parameter + // If no argument was passed, use the current time in seconds since the + // epoch as the random seed + if (random_arg->ival[0] == -1) + rc->random_seed = time(NULL); + else + // sscanf(optarg, "%zu", 
rc->random_seed); + rc->random_seed = random_arg->ival[0]; + } + + if (omp_threads->count > 0) + rc->omp_threads = omp_threads->ival[0]; + + if (vector_len->count > 0) { + rc->vector_len = vector_len->ival[0]; + if (rc->vector_len < 1) + error("Invalid vector len!", ERROR); + } + + if (runs->count > 0) + rc->nruns = runs->ival[0]; + + if (wrap->count > 0) + rc->wrap = wrap->ival[0]; + + if (count->count > 0) + rc->generic_len = count->ival[0]; + + if (local_work_size->count > 0) + rc->local_work_size = local_work_size->ival[0]; + + if (shared_memory->count > 0) + rc->shmem = shared_memory->ival[0]; + + if (name->count > 0) + copy_str_ignore_leading_space(rc->name, name->sval[0]); + + if (pattern->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern->sval[0]); + // char* filePtr = strstr(rc->generator, "FILE"); + // if (filePtr) + // safestrcopy(rc->generator, filePtr); + parse_p(rc->generator, rc, 0); + pattern_found = 1; + } + + if (pattern_gather->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern_gather->sval[0]); + parse_p(rc->generator, rc, 1); + pattern_gather_found = 1; + } + + if (pattern_scatter->count > 0) { + copy_str_ignore_leading_space(rc->generator, pattern_scatter->sval[0]); + parse_p(rc->generator, rc, 2); + pattern_scatter_found = 1; + } + + if (delta->count > 0) { + char delta_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_temp, delta->sval[0]); + char *delim = ","; + char *ptr = strtok(delta_temp, delim); + if (!ptr) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas; + spIdx_t *mydeltas_ps; + + mydeltas = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_ps = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read = 0; + if (sscanf(ptr, "%zu", &(mydeltas[read++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr = strtok(NULL, delim)) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(mydeltas[read++])) < 1) + 
error("Failed to parse pattern", ERROR); + } + + rc->deltas = mydeltas; + rc->deltas_ps = mydeltas_ps; + rc->deltas_len = read; + + // rotate + for (size_t i = 0; i < rc->deltas_len; i++) + rc->deltas_ps[i] = + rc->deltas[((i - 1) + rc->deltas_len) % rc->deltas_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_len; i++) + rc->deltas_ps[i] += rc->deltas_ps[i - 1]; + + // compute max + size_t m = rc->deltas_ps[0]; + for (size_t i = 1; i < rc->deltas_len; i++) { + if (rc->deltas_ps[i] > m) + m = rc->deltas_ps[i]; + } + rc->delta = m; + } + + if (delta_gather->count > 0) { + char delta_gather_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_gather_temp, delta_gather->sval[0]); + char *delim_gather = ","; + char *ptr_gather = strtok(delta_gather_temp, delim_gather); + if (!ptr_gather) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas_gather; + spIdx_t *mydeltas_gather_ps; + + mydeltas_gather = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_gather_ps = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read_gather = 0; + if (sscanf(ptr_gather, "%zu", &(mydeltas_gather[read_gather++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr_gather = strtok(NULL, delim_gather)) && + read_gather < MAX_PATTERN_LEN) { + if (sscanf(ptr_gather, "%zu", &(mydeltas_gather[read_gather++])) < 1) + error("Failed to parse pattern", ERROR); + } + + rc->deltas_gather = mydeltas_gather; + rc->deltas_gather_ps = mydeltas_gather_ps; + rc->deltas_gather_len = read_gather; + + // rotate + for (size_t i = 0; i < rc->deltas_gather_len; i++) + rc->deltas_gather_ps[i] = + rc->deltas_gather[((i - 1) + rc->deltas_gather_len) % + rc->deltas_gather_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_gather_len; i++) + rc->deltas_gather_ps[i] += rc->deltas_gather_ps[i - 1]; + + // compute max + size_t m = rc->deltas_gather_ps[0]; + for (size_t i = 1; i < rc->deltas_gather_len; 
i++) { + if (rc->deltas_gather_ps[i] > m) + m = rc->deltas_gather_ps[i]; + } + rc->delta_gather = m; + } + + if (delta_scatter->count > 0) { + char delta_scatter_temp[STRING_SIZE]; + copy_str_ignore_leading_space(delta_scatter_temp, delta_scatter->sval[0]); + char *delim_scatter = ","; + char *ptr_scatter = strtok(delta_scatter_temp, delim_scatter); + if (!ptr_scatter) + error("Pattern not found", ERROR); + + spIdx_t *mydeltas_scatter; + spIdx_t *mydeltas_scatter_ps; + + mydeltas_scatter = sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + mydeltas_scatter_ps = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + size_t read_scatter = 0; + if (sscanf(ptr_scatter, "%zu", &(mydeltas_scatter[read_scatter++])) < 1) + error("Failed to parse first pattern element in deltas", ERROR); + + while ((ptr_scatter = strtok(NULL, delim_scatter)) && + read_scatter < MAX_PATTERN_LEN) { + if (sscanf(ptr_scatter, "%zu", &(mydeltas_scatter[read_scatter++])) < 1) + error("Failed to parse pattern", ERROR); + } + + rc->deltas_scatter = mydeltas_scatter; + rc->deltas_scatter_ps = mydeltas_scatter_ps; + rc->deltas_scatter_len = read_scatter; + + // rotate + for (size_t i = 0; i < rc->deltas_scatter_len; i++) + rc->deltas_scatter_ps[i] = + rc->deltas_scatter[((i - 1) + rc->deltas_scatter_len) % + rc->deltas_scatter_len]; + + // compute prefix-sum + for (size_t i = 1; i < rc->deltas_scatter_len; i++) + rc->deltas_scatter_ps[i] += rc->deltas_scatter_ps[i - 1]; + + // compute max + size_t m = rc->deltas_scatter_ps[0]; + for (size_t i = 1; i < rc->deltas_scatter_len; i++) { + if (rc->deltas_scatter_ps[i] > m) + m = rc->deltas_scatter_ps[i]; + } + rc->delta_scatter = m; + } + + if (morton->count > 0) + rc->ro_morton = morton->ival[0]; + + if (hilbert->count > 0) + rc->ro_hilbert = hilbert->ival[0]; + + if (roblock->count > 0) + rc->ro_block = roblock->ival[0]; + + if (stride->count > 0) + rc->stride_kernel = stride->ival[0]; + + // VALIDATE ARGUMENTS + if (rc->kernel != GS && 
!pattern_found) + error("Please specify a pattern", ERROR); + + if ((rc->kernel == GS && !pattern_scatter_found) || + (rc->kernel == GS && !pattern_gather_found)) + error("Please specify a gather pattern and a scatter pattern for an GS " + "kernel", + ERROR); + + if (rc->kernel == GS && (rc->pattern_gather_len != rc->pattern_scatter_len)) + error("Gather pattern and scatter pattern must have the same length", + ERROR); + + if (rc->vector_len == 0) { + error("Vector length not set. Default is 1", WARN); + rc->vector_len = 1; + } + + if (rc->wrap == 0) { + error("length of smallbuf not specified. Default is 1 (slot of size " + "pattern_len elements)", + WARN); + rc->wrap = 1; + } + + if (rc->nruns == 0) { + error("Number of runs not specified. Default is 10 ", WARN); + rc->nruns = 10; + } + + if (rc->generic_len == 0) { + error("Length not specified. Default is 1024 (gathers/scatters)", WARN); + rc->generic_len = 1024; + } + + if (rc->kernel == INVALID_KERNEL) { + error("Kernel unspecified, guess GATHER", WARN); + rc->kernel = GATHER; + safestrcopy(kernel_name, "gather"); + } + + if (rc->kernel == SCATTER) + sprintf(kernel_name, "%s%zu", "scatter", rc->vector_len); + else if (rc->kernel == GATHER) + sprintf(kernel_name, "%s%zu", "gather", rc->vector_len); + else if (rc->kernel == GS) + sprintf(kernel_name, "%s%zu", "sg", rc->vector_len); + + if (pattern_found) { + if (rc->delta <= -1) { + error("delta not specified, default is 8\n", WARN); + rc->delta = 8; + rc->deltas_len = 1; + } + } + + if (pattern_gather_found) { + if (rc->delta_gather <= -1) { + error("delta gather not specified, default is 8\n", WARN); + rc->delta_gather = 8; + rc->deltas_gather_len = 1; + } + } + + if (pattern_scatter_found) { + if (rc->delta_scatter <= -1) { + error("delta scatter not specified, default is 8\n", WARN); + rc->delta_scatter = 8; + rc->deltas_scatter_len = 1; + } + } + + if (rc->op != OP_COPY) + error("OP must be OP_COPY", WARN); + + if (!strcasecmp(rc->name, "NONE")) { + if 
(rc->type != CUSTOM) + safestrcopy(rc->name, rc->generator); + else + safestrcopy(rc->name, "CUSTOM"); + } + +#ifdef USE_OPENMP + int max_threads = omp_get_max_threads(); + if (rc->omp_threads > max_threads) { + error("Too many OpenMP threads requested, using the max instead", WARN); + rc->omp_threads = max_threads; + } + if (rc->omp_threads == 0) { + error("Number of OpenMP threads not specified, using the max", WARN); + rc->omp_threads = max_threads; + } +#else + if (rc->omp_threads > 1) + error("Compiled without OpenMP support but requsted more than 1 thread, " + "using 1 instead", + WARN); +#endif + +#if defined USE_CUDA || defined USE_OPENCL + if (rc->local_work_size == 0) { + error("Local_work_size not set. Default is 1", WARN); + rc->local_work_size = 1; + } +#endif + return rc; +} + +ssize_t power(int base, int exp) { + int i, result = 1; + for (i = 0; i < exp; i++) + result *= base; + return result; +} + +// Yes, there is no need for recursion here but I did this in python first. 
I'll +// update this later with a cleaner implementation +void static laplacian_branch(int depth, int order, int n, int **pos, + int *pos_len) { + *pos = (int *)realloc(*pos, ((*pos_len) + order) * sizeof(int)); + + for (int i = 0; i < order; i++) { + (*pos)[i + *pos_len] = (i + 1) * power(n, depth); + } + + *pos_len += order; + return; +} + +void static laplacian(int dim, int order, int n, struct run_config *rc, + int mode) { + spIdx_t **pattern; + spSize_t *pattern_len; + + if (mode == 0) { // Normal pattern + pattern = &rc->pattern; + pattern_len = &rc->pattern_len; + } else if (mode == 1) { // Gather pattern (GS Kernel) + pattern = &rc->pattern_gather; + pattern_len = &rc->pattern_gather_len; + } else if (mode == 2) { // Scatter pattern (GS Kernel) + pattern = &rc->pattern_scatter; + pattern_len = &rc->pattern_scatter_len; + } else { + printf("laplacian: invalid mode %d\n", mode); + exit(1); + } + + if (dim < 1) { + error("laplacian: dim must be positive", ERROR); + } + + int final_len = dim * order * 2 + 1; + if (final_len > MAX_PATTERN_LEN) { + error("laplacian: resulting pattern too long", ERROR); + } + + int pos_len = 0; + int *pos = NULL; + + for (int i = 0; i < dim; i++) { + laplacian_branch(i, order, n, &pos, &pos_len); + } + + *pattern_len = final_len; + + *pattern = sp_calloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + int max = pos[pos_len - 1]; + + for (int i = 0; i < *pattern_len; i++) { + (*pattern)[i] = 2; + } + + // populate rc->pattern + for (int i = 0; i < pos_len; i++) { + (*pattern)[i] = (-pos[pos_len - i - 1] + max); + } + + (*pattern)[pos_len] = max; + + for (int i = 0; i < pos_len; i++) { + (*pattern)[pos_len + 1 + i] = pos[i] + max; + } + + free(pos); + return; +} + +void parse_backend(int argc, char **argv) { + err_file = stderr; + + safestrcopy(platform_string, "NONE"); + safestrcopy(device_string, "NONE"); + safestrcopy(kernel_file, "NONE"); + safestrcopy(kernel_name, "NONE"); + + if (backend_arg->count > 0) { + if 
(!strcasecmp("OPENCL", backend_arg->sval[0])) + backend = OPENCL; + else if (!strcasecmp("OPENMP", backend_arg->sval[0])) + backend = OPENMP; + else if (!strcasecmp("CUDA", backend_arg->sval[0])) + backend = CUDA; + else if (!strcasecmp("SERIAL", backend_arg->sval[0])) + backend = SERIAL; + else + error("Unrecognized Backend", ERROR); + } + + if (cl_platform->count > 0) + copy_str_ignore_leading_space(platform_string, cl_platform->sval[0]); + + if (cl_device->count > 0) + copy_str_ignore_leading_space(device_string, cl_device->sval[0]); + + if (interactive->count > 0) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + + if (kernelFile->count > 0) + copy_str_ignore_leading_space(kernel_file, kernelFile->filename[0]); + + if (no_print_header->count > 0) + quiet_flag = no_print_header->ival[0]; + + if (validate->count > 0) + validate_flag++; + + if (aggregate->count > 0) + aggregate_flag = 1; + + if (compress->count > 0) + compress_flag = 1; + + if (papi->count > 0) { +#ifdef USE_PAPI + { + char *pch = strtok(papi->sval[0], ","); + while (pch != NULL) { + safestrcopy(papi_event_names[papi_nevents++], pch); + pch = strtok(NULL, ","); + if (papi_nevents == PAPI_MAX_COUNTERS) + break; + } + } +#endif + } + + /* Check argument coherency */ + if (backend == INVALID_BACKEND) { + if (sg_cuda_support()) { + backend = CUDA; + error("No backend specified, guessing CUDA", WARN); + } else if (sg_opencl_support()) { + backend = OPENCL; + error("No backend specified, guessing OpenCL", WARN); + } else if (sg_openmp_support()) { + backend = OPENMP; + error("No backend specified, guessing OpenMP", WARN); + } else if (sg_serial_support()) { + backend = SERIAL; + error("No backend specified, guessing Serial", WARN); + } else + error("No backends available! 
Please recompile spatter with at least one " + "backend.", + ERROR); + } + + // Check to see if they compiled with support for their requested backend + if (backend == OPENCL) { + if (!sg_opencl_support()) + error("You did not compile with support for OpenCL", ERROR); + } else if (backend == OPENMP) { + if (!sg_openmp_support()) + error("You did not compile with support for OpenMP", ERROR); + } else if (backend == CUDA) { + if (!sg_cuda_support()) + error("You did not compile with support for CUDA", ERROR); + } else if (backend == SERIAL) { + if (!sg_serial_support()) + error("You did not compile with support for serial execution", ERROR); + } + + if (backend == OPENCL) { + if (!strcasecmp(platform_string, "NONE")) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + if (!strcasecmp(device_string, "NONE")) { + safestrcopy(platform_string, INTERACTIVE); + safestrcopy(device_string, INTERACTIVE); + } + } + +#ifdef USE_CUDA + if (backend == CUDA) { + int dev = find_device_cuda(device_string); + if (dev == -1) { + error("Specified CUDA device not found or no device specified. 
Using " + "device 0", + WARN); + dev = 0; + } + cuda_dev = dev; + cudaSetDevice(dev); + } +#endif + + if (!strcasecmp(kernel_file, "NONE") && backend == OPENCL) { + error("Kernel file unspecified, guessing kernels/kernels_vector.cl", WARN); + safestrcopy(kernel_file, "kernels/kernels_vector.cl"); + } + + return; +} + +void parse_p(char *optarg, struct run_config *rc, int mode) { + spIdx_t **pattern; + spSize_t *pattern_len; + ssize_t *delta; + size_t **deltas; + size_t *deltas_len; + + if (mode == 0) { // Normal pattern + pattern = &rc->pattern; + pattern_len = &rc->pattern_len; + delta = &rc->delta; + deltas = &rc->deltas_gather; + deltas_len = &rc->deltas_len; + } else if (mode == 1) { // Gather pattern (GS Kernel) + pattern = &rc->pattern_gather; + pattern_len = &rc->pattern_gather_len; + delta = &rc->delta_gather; + deltas = &rc->deltas_gather; + deltas_len = &rc->deltas_gather_len; + } else if (mode == 2) { // Scatter pattern (GS Kernel) + pattern = &rc->pattern_scatter; + pattern_len = &rc->pattern_scatter_len; + delta = &rc->delta_scatter; + deltas = &rc->deltas_scatter; + deltas_len = &rc->deltas_scatter_len; + } else { + printf("parse_p: invalid mode %d\n", mode); + exit(1); + } + + rc->type = INVALID_IDX; + char *arg = 0; + if ((arg = strchr(optarg, ':'))) { + *arg = '\0'; + arg++; // arg now points to arguments to the pattern type + + // FILE mode indicates that we will load a + // config from a file + if (!strcmp(optarg, "FILE")) { + // TODO + // safestrcopy(idx_pattern_file, arg); + rc->type = CONFIG_FILE; + } + + // The Exxon Kernel Proxy-derived stencil + // It used to be called HYDRO so we will accept that too + // XKP:dim + else if (!strcmp(optarg, "XKP") || !strcmp(optarg, "HYDRO")) { + rc->type = XKP; + + size_t dim = 0; + char *dim_char = strtok(arg, ":"); + if (!dim_char) + error("XKP: size not found", 1); + if (sscanf(dim_char, "%zu", &dim) < 1) + error("XKP: Dimension not parsed", 1); + + *pattern_len = 73; + + *pattern = 
sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // The default delta is 1 + *delta = 1; + + if (!(*deltas)) { + *deltas = sp_malloc(sizeof(size_t), 1, ALIGN_CACHE); + } + *deltas[0] = *delta; + *deltas_len = 1; + + xkp_pattern(*pattern, dim); + } + + // Parse Uniform Stride Arguments, which are + // UNIFORM:index_length:stride + else if (!strcmp(optarg, "UNIFORM")) { + rc->type = UNIFORM; + + // Read the length + char *len = strtok(arg, ":"); + if (!len) + error("UNIFORM: Index Length not found", 1); + if (sscanf(len, "%zu", &(*pattern_len)) < 1) + error("UNIFORM: Length not parsed", 1); + + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // Read the stride + char *stride = strtok(NULL, ":"); + ssize_t strideval = 0; + if (!stride) + error("UNIFORM: Stride not found", 1); + if (sscanf(stride, "%zd", &strideval) < 1) + error("UNIFORM: Stride not parsed", 1); + + // Fill the pattern buffer + for (int i = 0; i < *pattern_len; i++) + (*pattern)[i] = i * strideval; + + char *delta2 = strtok(NULL, ":"); + if (delta2) { + if (!*deltas) { + *deltas = sp_malloc(sizeof(size_t), 1, ALIGN_CACHE); + } + *deltas_len = 1; + + if (!strcmp(delta2, "NR")) { + *delta = strideval * (*pattern_len); + (*deltas)[0] = *delta; + } else { + if (sscanf(delta2, "%zd", &(*delta)) < 1) + error("UNIFORM: delta not parsed", 1); + (*deltas)[0] = *delta; + } + } + + } + + // LAPLACIAN:DIM:ORDER:N + else if (!strcmp(optarg, "LAPLACIAN")) { + int dim_val, order_val, problem_size_val; + + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + rc->type = LAPLACIAN; + + // Read the dimension + char *dim = strtok(arg, ":"); + if (!dim) + error("LAPLACIAN: Dimension not found", 1); + if (sscanf(dim, "%d", &dim_val) < 1) + error("LAPLACIAN: Dimension not parsed", 1); + + // Read the order + char *order = strtok(NULL, ":"); + if (!order) + error("LAPLACIAN: Order not found", 1); + if (sscanf(order, "%d", &order_val) < 1) + error("LAPLACIAN: Order not parsed", 
1); + + // Read the problem size + char *problem_size = strtok(NULL, ":"); + if (!problem_size) + error("LAPLACIAN: Problem size not found", 1); + if (sscanf(problem_size, "%d", &problem_size_val) < 1) + error("LAPLACIAN: Problem size not parsed", 1); + + *delta = 1; + if (!(*deltas)) { + *deltas = sp_malloc(sizeof(spIdx_t), *delta, ALIGN_CACHE); + } + (*deltas)[0] = *delta; + *deltas_len = 1; + + laplacian(dim_val, order_val, problem_size_val, rc, mode); + } + + // Mostly Stride 1 Mode + // Arguments: index_length:list_of_breaks:list_of_deltas + // list_of_deltas should be length 1 or the same length as + // list_of_breaks. + // The elements of both lists should be nonnegative and + // the the elements of list_of_breaks should be strictly less + // than index_length + else if (!strcmp(optarg, "MS1")) { + rc->type = MS1; + + char *len = strtok(arg, ":"); + char *breaks = strtok(NULL, ":"); + char *gaps = strtok(NULL, ":"); + + size_t *ms1_breaks = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + size_t *ms1_deltas = + sp_malloc(sizeof(size_t), MAX_PATTERN_LEN, ALIGN_CACHE); + size_t ms1_breaks_len = 0; + size_t ms1_deltas_len = 0; + + // Parse index length + sscanf(len, "%zu", &(*pattern_len)); + *pattern = sp_malloc(sizeof(spIdx_t), *pattern_len, ALIGN_CACHE); + + // Parse breaks + char *ptr = strtok(breaks, ","); + size_t read = 0; + if (!ptr) + error("MS1: Breaks missing", 1); + if (sscanf(ptr, "%zu", &(ms1_breaks[read++])) < 1) + error("MS1: Failed to parse first break", 1); + + while ((ptr = strtok(NULL, ",")) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(ms1_breaks[read++])) < 1) + error("MS1: Failed to parse breaks", 1); + } + + ms1_breaks_len = read; + + if (!gaps) { + printf("1\n"); + error("error", ERROR); + } + + ptr = strtok(gaps, ","); + read = 0; + if (ptr) { + if (sscanf(ptr, "%zu", &(ms1_deltas[read++])) < 1) + error("Failed to parse first delta", 1); + + while ((ptr = strtok(NULL, ",")) && read < MAX_PATTERN_LEN) { + if 
(sscanf(ptr, "%zu", &(ms1_deltas[read++])) < 1) + error("Failed to parse deltas", 1); + } + } else + error("MS1: deltas missing", 1); + + ms1_deltas_len = read; + + (*pattern)[0] = -1; + size_t last = -1; + ssize_t j; + for (int i = 0; i < *pattern_len; i++) { + if ((j = setincludes(i, ms1_breaks, ms1_breaks_len)) != -1) + (*pattern)[i] = last + ms1_deltas[ms1_deltas_len > 1 ? j : 0]; + else + (*pattern)[i] = last + 1; + last = (*pattern)[i]; + } + + free(ms1_breaks); + free(ms1_deltas); + } else + error("Unrecognized mode in -p argument", 1); + } + + // CUSTOM mode means that the user supplied a single index buffer on the + // command line + else { + if (quiet_flag > 3) { + printf("Parse P Custom Pattern: %s\n", optarg); + } + rc->type = CUSTOM; + char *delim = ","; + char *ptr = strtok(optarg, delim); + size_t read = 0; + if (!ptr) + error("Pattern not found", 1); + + spIdx_t *mypat; + + mypat = sp_malloc(sizeof(spIdx_t), MAX_PATTERN_LEN, ALIGN_CACHE); + + if (sscanf(ptr, "%zu", &(mypat[read++])) < 1) + error("Failed to parse first pattern element in custom mode", 1); + + while ((ptr = strtok(NULL, delim)) && read < MAX_PATTERN_LEN) { + if (sscanf(ptr, "%zu", &(mypat[read++])) < 1) + error("Failed to parse pattern", 1); + } + *pattern = mypat; + *pattern_len = read; + } + + if (*pattern_len == 0) + error("Pattern length of 0", ERROR); + + if (rc->type == INVALID_IDX) + error("No pattern type set", ERROR); +} + +ssize_t setincludes(size_t key, size_t *set, size_t set_len) { + for (size_t i = 0; i < set_len; i++) { + if (set[i] == key) + return i; + } + return -1; +} + +void print_run_config(struct run_config rc) { + printf("Index: %zu ", rc.pattern_len); + printf("["); + for (size_t i = 0; i < rc.pattern_len; i++) { + printf("%zu", rc.pattern[i]); + if (i != rc.pattern_len - 1) + printf(" "); + } + printf("]\n"); + if (rc.deltas_len > 0) { + printf("Deltas: %zu ", rc.deltas_len); + printf("["); + for (size_t i = 0; i < rc.deltas_len; i++) { + printf("%zu", 
rc.deltas[i]); + if (i != rc.deltas_len - 1) + printf(" "); + } + printf("]\n"); + printf("Deltas_ps: %zu ", rc.deltas_len); + printf("["); + for (size_t i = 0; i < rc.deltas_len; i++) { + printf("%zu", rc.deltas_ps[i]); + if (i != rc.deltas_len - 1) + printf(" "); + } + printf("] (%zu)\n", rc.delta); + } else + printf("Delta: %zu\n", rc.delta); + + printf("kern: %s\n", kernel_name); + printf("genlen: %zu\n", rc.generic_len); +} + +void error(char *what, int code) { + if (code == ERROR) + fprintf(err_file, "Error: "); + else if (code == WARN) { + if (verbose) + fprintf(err_file, "Warning: "); + } + + if (verbose || code) { + fprintf(err_file, "%s", what); + fprintf(err_file, "\n"); + } + + if (code) + exit(code); +} + +void safestrcopy(char *dest, const char *src) { + dest[0] = '\0'; + strncat(dest, src, STRING_SIZE - 1); +} + +int compare_ssizet(const void *a, const void *b) { + if (*(ssize_t *)a > *(ssize_t *)b) + return 1; + else if (*(ssize_t *)a < *(ssize_t *)b) + return -1; + else + return 0; +} + +void copy4(ssize_t *dest, ssize_t *a, int *off) { + for (int i = 0; i < 4; i++) { + dest[i + *off] = a[i]; + } + *off += 4; +} + +void add4(ssize_t *dest, ssize_t *a, ssize_t *b, int *off) { + for (int i = 0; i < 4; i++) { + dest[i + *off] = a[i] + b[i]; + } + *off += 4; +} + +void xkp_pattern(size_t *pat_, size_t dim) { + ssize_t pat[73]; + for (int i = 0; i < 73; i++) { + pat[i] = i; + } + + ssize_t Xp[4]; + ssize_t Xn[4]; + ssize_t Yp[4]; + ssize_t Yn[4]; + ssize_t Zp[4]; + ssize_t Zn[4]; + + Xp[0] = 1; + Xp[1] = 2; + Xp[2] = 3; + Xp[3] = 4; + Xn[0] = -1; + Xn[1] = -2; + Xn[2] = -3; + Xn[3] = -4; + Yp[0] = dim; + Yp[1] = 2 * dim; + Yp[2] = 3 * dim; + Yp[3] = 4 * dim; + Yn[0] = -dim; + Yn[1] = -2 * dim; + Yn[2] = -3 * dim; + Yn[3] = -4 * dim; + Zp[0] = dim * dim; + Zp[1] = 2 * dim * dim; + Zp[2] = 3 * dim * dim; + Zp[3] = 4 * dim * dim; + Zn[0] = -dim * dim; + Zn[1] = -2 * dim * dim; + Zn[2] = -3 * dim * dim; + Zn[3] = -4 * dim * dim; + + int idx = 0; + 
pat[idx++] = 0; + copy4(pat, Xp, &idx); + copy4(pat, Xn, &idx); + copy4(pat, Yp, &idx); + copy4(pat, Yn, &idx); + copy4(pat, Zp, &idx); + copy4(pat, Zn, &idx); + + add4(pat, Xp, Yp, &idx); + add4(pat, Xp, Zp, &idx); + add4(pat, Xp, Yn, &idx); + add4(pat, Xp, Zn, &idx); + + add4(pat, Xn, Yp, &idx); + add4(pat, Xn, Zp, &idx); + add4(pat, Xn, Yn, &idx); + add4(pat, Xn, Zn, &idx); + + add4(pat, Yp, Zp, &idx); + add4(pat, Yp, Zn, &idx); + add4(pat, Yn, Zp, &idx); + add4(pat, Yn, Zn, &idx); + + qsort(pat, 73, sizeof(ssize_t), compare_ssizet); + + ssize_t min = pat[0]; + for (int i = 1; i < 73; i++) { + if (pat[i] < min) { + min = pat[i]; + } + } + + for (int i = 0; i < 73; i++) { + pat[i] -= min; + } + + for (int i = 0; i < 73; i++) { + pat_[i] = pat[i]; + } +} diff --git a/tests/pcg_basic.c b/tests/pcg_basic.c new file mode 100644 index 0000000..86e4ff3 --- /dev/null +++ b/tests/pcg_basic.c @@ -0,0 +1,106 @@ +/* + * PCG Random Number Generation for C. + * + * Copyright 2014 Melissa O'Neill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For additional information about the PCG random number generation scheme, + * including its license and other licensing options, visit + * + * http://www.pcg-random.org + */ + +/* + * This code is derived from the full C implementation, which is in turn + * derived from the canonical C++ PCG implementation. The C++ version + * has many additional features and is preferable if you can use C++ in + * your project. 
+ */ + +#include "pcg_basic.h" + +// state for global RNGs + +static pcg32_random_t pcg32_global = PCG32_INITIALIZER; + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. stream id) + +void pcg32_srandom_r(pcg32_random_t *rng, uint64_t initstate, + uint64_t initseq) { + rng->state = 0U; + rng->inc = (initseq << 1u) | 1u; + pcg32_random_r(rng); + rng->state += initstate; + pcg32_random_r(rng); +} + +void pcg32_srandom(uint64_t seed, uint64_t seq) { + pcg32_srandom_r(&pcg32_global, seed, seq); +} + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random_r(pcg32_random_t *rng) { + uint64_t oldstate = rng->state; + rng->state = oldstate * 6364136223846793005ULL + rng->inc; + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); +} + +uint32_t pcg32_random() { return pcg32_random_r(&pcg32_global); } + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand_r(pcg32_random_t *rng, uint32_t bound) { + // To avoid bias, we need to make the range of the RNG a multiple of + // bound, which we do by dropping output less than a threshold. + // A naive scheme to calculate the threshold would be to do + // + // uint32_t threshold = 0x100000000ull % bound; + // + // but 64-bit div/mod is slower than 32-bit div/mod (especially on + // 32-bit platforms). In essence, we do + // + // uint32_t threshold = (0x100000000ull-bound) % bound; + // + // because this version will calculate the same modulus, but the LHS + // value is less than 2^32. + + uint32_t threshold = -bound % bound; + + // Uniformity guarantees that this loop will terminate. 
In practice, it + // should usually terminate quickly; on average (assuming all bounds are + // equally likely), 82.25% of the time, we can expect it to require just + // one iteration. In the worst case, someone passes a bound of 2^31 + 1 + // (i.e., 2147483649), which invalidates almost 50% of the range. In + // practice, bounds are typically small and only a tiny amount of the range + // is eliminated. + for (;;) { + uint32_t r = pcg32_random_r(rng); + if (r >= threshold) + return r % bound; + } +} + +uint32_t pcg32_boundedrand(uint32_t bound) { + return pcg32_boundedrand_r(&pcg32_global, bound); +} diff --git a/tests/sp_alloc.c b/tests/sp_alloc.c new file mode 100644 index 0000000..1741e29 --- /dev/null +++ b/tests/sp_alloc.c @@ -0,0 +1,97 @@ +/* +© (or copyright) 2022. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for +Los Alamos National Laboratory (LANL), which is operated by Triad National +Security, LLC for the U.S. Department of Energy/National Nuclear Security +Administration. All rights in the program are reserved by Triad National +Security, LLC, and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others acting on its +behalf a nonexclusive, paid-up, irrevocable worldwide license in this material +to reproduce, prepare derivative works, distribute copies to the public, perform +publicly and display publicly, and to permit others to do so. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ +Copyright (c) 2018, HPCGarage research group at Georgia Tech +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notices (both +LANL and GT), this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of spatter nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+*/ + +#include "sp_alloc.h" +#include "parse-args.h" //error +#include +#include //exit +#include //memset + +long long total_mem_used = 0; + +long long get_mem_used() { return total_mem_used; } +void check_size(size_t size) { + total_mem_used += size; + // printf("size: %zu\n", size); + if (total_mem_used > SP_MAX_ALLOC) { + error("Too much memory used.", ERROR); + } +} + +void check_safe_mult(size_t a, size_t b) { + int hi_bit_a = 0; + int hi_bit_b = 0; + + while (a >>= 1) + hi_bit_a++; + while (b >>= 1) + hi_bit_b++; + + if (hi_bit_a + hi_bit_b > sizeof(size_t) * 8) { + error("Error: Multiplication would overflow.", ERROR); + } +} + +void *sp_malloc(size_t size, size_t count, size_t align) { + check_safe_mult(size, count); + check_size(size * count); +#ifdef USE_POSIX_MEMALIGN + void *ptr = NULL; + int ret = posix_memalign(&ptr, align, size * count); + if (ret != 0) + ptr = NULL; +#else + void *ptr = aligned_alloc(align, size * count); +#endif + if (!ptr) { + printf("Attempted to allocate %zu bytes (%zu * %zu)\n", size * count, size, + count); + error("Error: failed to allocate memory", ERROR); + } + return ptr; +} + +void *sp_calloc(size_t size, size_t count, size_t align) { + void *ptr = sp_malloc(size, count, align); + memset(ptr, 0, size * count); + return ptr; +} diff --git a/tests/test.c b/tests/test.c new file mode 100644 index 0000000..30ceb13 --- /dev/null +++ b/tests/test.c @@ -0,0 +1,627 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "client.h" +#include "client_cleanup.h" +#include "client_init.h" +#include "client_memory.h" +#include "client_place_requests.h" +#include "config.h" +#include "kernels.h" +#include "shm_malloc.h" + +// functions for measuring execution time of read_data and write_data (adapted +// from https://stackoverflow.com/a/19898211) +struct timespec start_timer() { + struct timespec start_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + return start_time; +} + +// call this function to 
end a timer, returning nanoseconds elapsed as a long +uint64_t stop_timer(struct timespec start_time) { + struct timespec end_time; + clock_gettime(CLOCK_MONOTONIC, &end_time); + uint64_t diffInNanos = (end_time.tv_sec - start_time.tv_sec) * (uint64_t)1e9 + + (end_time.tv_nsec - start_time.tv_nsec); + return diffInNanos; +} + +#define TIME(cmd, elapsed_ns) \ + { \ + struct timespec start = start_timer(); \ + cmd; \ + elapsed_ns += stop_timer(start); \ + } + +#define SINGLE_ALLOC + +#ifdef SINGLE_ALLOC +double *res, *input, *buffer, *data; +size_t *ind1, *ind2, *tmp; +#endif + +#ifdef USE_CLIENT +struct client client; +int req_id = 0; +#else +// use regular malloc and free if we're not using the client model +#define shm_malloc malloc +#define shm_free free +#endif + +// mock API to interact with memory accelerator +double *read_data(const double *buffer, size_t N, const size_t *ind1, + const size_t *ind2, uint64_t *elapsed_ns, size_t num_threads, + bool use_avx) { +// Only time loops, not memory allocation or flow control +#ifndef SINGLE_ALLOC + double *res = (double *)shm_malloc(N * sizeof(double)); +#endif + + memset(res, 0, N * sizeof(double)); + +#ifdef USE_CLIENT + + struct request read_req; + + TIME( + { + scoria_read(&client, buffer, N, res, ind1, ind2, num_threads, use_avx, + &read_req); + wait_request(&client, &read_req); + }, + *elapsed_ns) + +#else + + if (ind1 == NULL) { + assert(ind2 == NULL); + if (num_threads == 0) { + TIME(read_single_thread_0(res, buffer, N, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_0(res, buffer, N, num_threads, use_avx), + *elapsed_ns) + } + return res; + } + + if (ind2 == NULL) { + assert(ind1 != NULL); + if (num_threads == 0) { + TIME(read_single_thread_1(res, buffer, N, ind1, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_1(res, buffer, N, ind1, num_threads, use_avx), + *elapsed_ns) + } + return res; + } + + assert(ind1 != NULL); + assert(ind2 != NULL); + + if (num_threads == 0) { + 
TIME(read_single_thread_2(res, buffer, N, ind1, ind2, use_avx), *elapsed_ns) + } else { + TIME(read_multi_thread_2(res, buffer, N, ind1, ind2, num_threads, use_avx), + *elapsed_ns) + } + +#endif + + return res; +} + +void write_data(double *buffer, size_t N, const double *input, + const size_t *ind1, const size_t *ind2, uint64_t *elapsed_ns, + size_t num_threads, bool use_avx) { +#ifdef USE_CLIENT + + struct request write_req; + + TIME( + { + scoria_write(&client, buffer, N, input, ind1, ind2, num_threads, + use_avx, &write_req); + wait_request(&client, &write_req); + }, + *elapsed_ns) + +#else + + // Only time loops, not memory allocation or flow control + if (ind1 == NULL) { + assert(ind2 == NULL); + if (num_threads == 0) { + TIME(write_single_thread_0(buffer, input, N, use_avx), *elapsed_ns) + } else { + TIME(write_multi_thread_0(buffer, input, N, num_threads, use_avx), + *elapsed_ns) + } + return; + } + + if (ind2 == NULL) { + assert(ind1 != NULL); + if (num_threads == 0) { + TIME(write_single_thread_1(buffer, input, N, ind1, use_avx), *elapsed_ns) + } else { + TIME(write_multi_thread_1(buffer, input, N, ind1, num_threads, use_avx), + *elapsed_ns) + } + return; + } + + assert(ind1 != NULL); + assert(ind2 != NULL); + + if (num_threads == 0) { + TIME(write_single_thread_2(buffer, input, N, ind1, ind2, use_avx), + *elapsed_ns) + } else { + TIME(write_multi_thread_2(buffer, input, N, ind1, ind2, num_threads, + use_avx), + *elapsed_ns) + } + +#endif +} + +// lower and upper bounds are INCLUSIVE +size_t irand(size_t lower, size_t upper) { + assert(upper >= lower); + // drand48 returns a double in the range[0,1.0) (inclusive 0, exclusive 1), so + // multiply that by (upper - lower + 1) so that after truncating, we get an + // integer in the range [lower, upper] (both bounds inclusive) + size_t res = lower + (size_t)(drand48() * (double)(upper - lower + 1)); + assert((res >= lower) && (res <= upper)); + return res; +} + +#ifdef SINGLE_ALLOC +// don't do anything 
+#define RES_FREE + +#define INPUT_MALLOC +#define INPUT_FREE + +#define TMP_MALLOC +#define TMP_FREE + +#else +// do malloc and free +#define RES_FREE shm_free(res); + +#define INPUT_MALLOC double *input = (double *)shm_malloc(N * sizeof(double)); +#define INPUT_FREE shm_free(input); + +// this doesn't need to be in shared memory +#define TMP_MALLOC size_t *tmp = (size_t *)malloc(4 * N * sizeof(size_t)); +#define TMP_FREE free(tmp); + +#endif + +// return values: +// 0: read and write passed +// 1: read failed +// 2: write failed +#define CHECK_IMPL(ind1, ind2, IDX) \ + double *res = \ + read_data(data, N, ind1, ind2, time_read, num_threads, use_avx); \ + for (size_t i = 0; i < N; ++i) { \ + if (res[i] != data[IDX]) { \ + return 1; \ + } \ + } \ + RES_FREE; \ + \ + INPUT_MALLOC; \ + memset(input, 0, N * sizeof(double)); \ + \ + /* since we could have aliases, we can't compare data to input, since */ \ + /* some inputs could be overwritten by other inputs, and we also don't */ \ + /* have a guaranteed order in which aliased entries are written, so for */ \ + /* an aliased index, there are multiple valid values. First get alias */ \ + /* count and then make a list of valid values for each index. 
*/ \ + TMP_MALLOC; \ + memset(tmp, 0, 4 * N * sizeof(size_t)); \ + size_t *alias_cnt = tmp + 0 * N; \ + size_t *start_idx = tmp + 1 * N; \ + size_t *curr_idx = tmp + 2 * N; \ + size_t *src_idxs = tmp + 3 * N; \ + \ + for (size_t i = 0; i < N; ++i) { \ + input[i] = (double)i; \ + alias_cnt[IDX] += 1; \ + } \ + write_data(data, N, input, ind1, ind2, time_write, num_threads, use_avx); \ + \ + start_idx[0] = 0; \ + for (size_t i = 1; i < N; ++i) { \ + start_idx[i] = start_idx[i - 1] + alias_cnt[i - 1]; \ + } \ + for (size_t i = 0; i < N; ++i) { \ + size_t idx = IDX; \ + src_idxs[start_idx[idx] + curr_idx[idx]] = i; \ + curr_idx[idx] += 1; \ + } \ + \ + int ret = 0; \ + for (size_t i = 0; i < N; ++i) { \ + /* check that the value in data matches one of the possible input */ \ + /* values */ \ + size_t cnt = alias_cnt[i]; \ + if (cnt == 0) { \ + continue; \ + } \ + \ + bool match_found = false; \ + for (size_t j = start_idx[i]; j < start_idx[i] + cnt; ++j) { \ + if (data[i] == input[src_idxs[j]]) { \ + match_found = true; \ + break; \ + } \ + } \ + \ + if (!match_found) { \ + ret = 2; \ + break; \ + } \ + } \ + \ + INPUT_FREE; \ + TMP_FREE; \ + return ret; + +int check_0_level(double *data, size_t N, uint64_t *time_read, + uint64_t *time_write, size_t num_threads, bool use_avx) { + CHECK_IMPL(NULL, NULL, i) +} + +int check_1_level(double *data, size_t N, const size_t *ind, + uint64_t *time_read, uint64_t *time_write, size_t num_threads, + bool use_avx) { + CHECK_IMPL(ind, NULL, ind[i]) +} + +int check_2_level(double *data, size_t N, const size_t *ind1, + const size_t *ind2, uint64_t *time_read, uint64_t *time_write, + size_t num_threads, bool use_avx) { + CHECK_IMPL(ind1, ind2, ind2[ind1[i]]) +} + +// shuffle in-place the given indices from indices[0] to indices[N-1] +// using the modern Fisher-Yates shuffle (see +// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle and +// https://stackoverflow.com/a/10072899) +void shuffle(size_t *indices, size_t N) { + if (N 
<= 1) { + return; + } + + for (size_t i = 0; i < N - 1; ++i) { + size_t last_idx = N - 1 - i; + size_t index = irand(0, last_idx); + size_t temp = indices[index]; + indices[index] = indices[last_idx]; + indices[last_idx] = temp; + } +} + +void clustered_shuffle(size_t *indices, size_t N, size_t cluster_size) { + size_t num_clusters = (N + cluster_size - 1) / cluster_size; // round up + for (size_t c = 0; c < num_clusters; ++c) { + size_t cluster_start = c * cluster_size; + size_t cluster_end = cluster_start + cluster_size; + size_t this_cluster_size = + cluster_end < N ? cluster_size : N - cluster_start; + shuffle(indices + cluster_start, this_cluster_size); + } +} + +void add_aliases(size_t *indices, size_t N, double alias_fraction) { + for (size_t i = 0; i < N; ++i) { + if (drand48() < alias_fraction) { + // this index will be aliased, insert it at a random location + size_t idx = irand(0, N - 1); + indices[idx] = indices[i]; + } + } +} + +void add_clustered_aliases(size_t *indices, size_t N, double alias_fraction, + size_t cluster_size) { + size_t num_clusters = (N + cluster_size - 1) / cluster_size; // round up + for (size_t c = 0; c < num_clusters; ++c) { + size_t cluster_start = c * cluster_size; + size_t cluster_end = cluster_start + cluster_size; + size_t this_cluster_size = + cluster_end < N ? cluster_size : N - cluster_start; + add_aliases(indices + cluster_start, this_cluster_size, alias_fraction); + } +} + +bool report(const char *name, int result) { + (void)name; + // printf("%30s: %s\n", name, + // result == 0 + // ? "pass" + // : result == 1 ? "FAIL read" + // : result == 2 ? 
"FAIL write" : "FAIL unknown"); + + return (result == 0); +} + +void reset(double *data, size_t *ind1, size_t *ind2, size_t N) { + for (size_t i = 0; i < N; ++i) { + data[i] = (double)i; + ind1[i] = i; + ind2[i] = i; + } +} + +#define NUM_TESTS 11 + +bool run_test_suite(size_t N, size_t cluster_size, double alias_fraction, + size_t num_threads, bool use_avx, uint64_t *time_read, + uint64_t *time_write) { + // initialize random number generator, use a specific seed to make every run + // of the test suite use the same indirection + srand48(42); + + // things to test: + // - no indirection + // - 1 level of indirection + // . straight access + // . permutation (1-to-1), random and clustered + // . with aliases, random and clustered + // - 2 levels of indirection, same as above + +#ifndef SINGLE_ALLOC + double *data = (double *)shm_malloc(N * sizeof(double)); + size_t *ind1 = (size_t *)shm_malloc(N * sizeof(size_t)); + size_t *ind2 = (size_t *)shm_malloc(N * sizeof(size_t)); +#endif + + bool all_pass = true; + + // No indirection + reset(data, ind1, ind2, N); + all_pass &= report("No indirection", + check_0_level(data, N, time_read + 0, time_write + 0, + num_threads, use_avx)); + + // 1 level of indirection + + // straight access + reset(data, ind1, ind2, N); + all_pass &= report("1-lev straight", + check_1_level(data, N, ind1, time_read + 1, time_write + 1, + num_threads, use_avx)); + + // permutation (no aliases) + reset(data, ind1, ind2, N); + shuffle(ind1, N); + all_pass &= report("1-lev full shuffle no alias", + check_1_level(data, N, ind1, time_read + 2, time_write + 2, + num_threads, use_avx)); + + reset(data, ind1, ind2, N); + clustered_shuffle(ind1, N, cluster_size); + all_pass &= report("1-lev clustered no alias", + check_1_level(data, N, ind1, time_read + 3, time_write + 3, + num_threads, use_avx)); + + // with aliases + reset(data, ind1, ind2, N); + add_aliases(ind1, N, alias_fraction); + shuffle(ind1, N); + all_pass &= report("1-lev full shuffle with 
alias", + check_1_level(data, N, ind1, time_read + 4, time_write + 4, + num_threads, use_avx)); + + reset(data, ind1, ind2, N); + add_clustered_aliases(ind1, N, alias_fraction, cluster_size); + clustered_shuffle(ind1, N, cluster_size); + all_pass &= report("1-lev clustered with alias", + check_1_level(data, N, ind1, time_read + 5, time_write + 5, + num_threads, use_avx)); + + // 2 level of indirection + + // straight access + reset(data, ind1, ind2, N); + all_pass &= report("2-lev straight", + check_2_level(data, N, ind1, ind2, time_read + 6, + time_write + 6, num_threads, use_avx)); + + // permutation (no aliases) + reset(data, ind1, ind2, N); + shuffle(ind1, N); + shuffle(ind2, N); + all_pass &= report("2-lev full shuffle no alias", + check_2_level(data, N, ind1, ind2, time_read + 7, + time_write + 7, num_threads, use_avx)); + + reset(data, ind1, ind2, N); + clustered_shuffle(ind1, N, cluster_size); + clustered_shuffle(ind2, N, cluster_size); + all_pass &= report("2-lev clustered no alias", + check_2_level(data, N, ind1, ind2, time_read + 8, + time_write + 8, num_threads, use_avx)); + + // with aliases + reset(data, ind1, ind2, N); + add_aliases(ind1, N, alias_fraction); + add_aliases(ind2, N, alias_fraction); + shuffle(ind1, N); + shuffle(ind2, N); + all_pass &= report("2-lev full shuffle with alias", + check_2_level(data, N, ind1, ind2, time_read + 9, + time_write + 9, num_threads, use_avx)); + + reset(data, ind1, ind2, N); + add_clustered_aliases(ind1, N, alias_fraction, cluster_size); + add_clustered_aliases(ind2, N, alias_fraction, cluster_size); + clustered_shuffle(ind1, N, cluster_size); + clustered_shuffle(ind2, N, cluster_size); + all_pass &= report("2-lev clustered with alias", + check_2_level(data, N, ind1, ind2, time_read + 10, + time_write + 10, num_threads, use_avx)); + +#ifndef SINGLE_ALLOC + shm_free(data); + shm_free(ind1); + shm_free(ind2); +#endif + + return all_pass; +} + +void benchmark(size_t N, size_t cluster_size, double alias_fraction, + 
size_t num_threads, bool use_avx) { + size_t num_runs = 5; + size_t ignore_first_num = 1; + + bool all_pass = true; + uint64_t time_read[NUM_TESTS], time_read_sum[NUM_TESTS]; + uint64_t time_write[NUM_TESTS], time_write_sum[NUM_TESTS]; + + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read[j] = 0; + time_write[j] = 0; + time_read_sum[j] = 0; + time_write_sum[j] = 0; + } + + for (size_t i = 0; i < num_runs; ++i) { + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read[j] = 0; + time_write[j] = 0; + } + + all_pass &= run_test_suite(N, cluster_size, alias_fraction, num_threads, + use_avx, time_read, time_write); + + if (i >= ignore_first_num) { + for (size_t j = 0; j < NUM_TESTS; ++j) { + time_read_sum[j] += time_read[j]; + time_write_sum[j] += time_write[j]; + } + } + } + + printf("%8zu ", num_threads); + // we want to compute bandwidth in MiB/s, multiply data by number of tests + // timed + double bw_mult = (double)(N * sizeof(double) * (num_runs - ignore_first_num)); + // now divide to get GiB and multiply by 1e9 because time is in ns + bw_mult *= 1e9 / (1024.0 * 1024.0 * 1024.0); + + uint64_t total_read = 0; + uint64_t total_write = 0; + for (size_t j = 0; j < NUM_TESTS; ++j) { + total_read += time_read_sum[j]; + total_write += time_write_sum[j]; + printf("%4.1f | %4.1f ", bw_mult / (double)time_read_sum[j], + bw_mult / (double)time_write_sum[j]); + } + printf("%4.1f | %4.1f %s\n", NUM_TESTS * bw_mult / (double)total_read, + NUM_TESTS * bw_mult / (double)total_write, + all_pass ? "all pass" : "some FAILED"); +} + +void run_benchmarks(size_t N, size_t cluster_size, double alias_fraction, + bool use_avx) { + printf("\nRunning tests %s AVX intrinsics\n", use_avx ? 
"with" : "WITHOUT"); + const char *names[NUM_TESTS] = {"0-str", "1-str", "1-FnoA", "1-CnoA", + "1-FA", "1-CA", "2-str", "2-FnoA", + "2-CnoA", "2-FA", "2-CA"}; + printf("%8s ", "Threads"); + for (size_t j = 0; j < NUM_TESTS; ++j) { + printf("%11s ", names[j]); + } + printf("%11s\n", "Total"); + + benchmark(N, cluster_size, alias_fraction, 0, use_avx); + benchmark(N, cluster_size, alias_fraction, 1, use_avx); + benchmark(N, cluster_size, alias_fraction, 2, use_avx); + benchmark(N, cluster_size, alias_fraction, 4, use_avx); + benchmark(N, cluster_size, alias_fraction, 8, use_avx); + benchmark(N, cluster_size, alias_fraction, 16, use_avx); + benchmark(N, cluster_size, alias_fraction, 24, use_avx); + benchmark(N, cluster_size, alias_fraction, 32, use_avx); + benchmark(N, cluster_size, alias_fraction, 48, use_avx); +} + +int main(int argc, char **argv) { + // Suppress Compiler Warnings + (void)argc; + (void)argv; + + size_t N = 1024 * 1024; + size_t cluster_size = 32; + double alias_fraction = 0.1; + +#ifdef USE_CLIENT + printf("Running tests using the memory controller, which must be started " + "before this executable is run\n"); + client.chatty = 0; + init(&client); +#else + printf("Running tests WITHOUT using the memory controller\n"); +#endif + +#ifdef SINGLE_ALLOC + data = (double *)shm_malloc(N * sizeof(double)); + res = (double *)shm_malloc(N * sizeof(double)); + buffer = (double *)shm_malloc(N * sizeof(double)); + input = (double *)shm_malloc(N * sizeof(double)); + ind1 = (size_t *)shm_malloc(N * sizeof(size_t)); + ind2 = (size_t *)shm_malloc(N * sizeof(size_t)); + + // doesn't have to be shared memory + tmp = (size_t *)malloc(4 * N * sizeof(size_t)); +#endif + + printf( + "Benchmark results (average read | write bandwidth in GiB/s), N = %zu\n", + N); + printf(" 0|1|2: number of levels of indirection\n"); + printf(" str: straight access\n"); + printf(" F|C: full or clustered shuffle\n"); + printf(" A|noA: with or without aliases\n\n"); + + run_benchmarks(N, 
cluster_size, alias_fraction, false); + run_benchmarks(N, cluster_size, alias_fraction, true); + +#ifdef USE_CLIENT + // send quit request + // struct request quit_req; + // quit_req.client = client_id; + // quit_req.r_type = Quit; + // quit_req.id = ++req_id; + // scoria_put_request(&client, &quit_req); + // wait_request(&client, &quit_req); + + cleanup(&client); +#endif + +#ifdef SINGLE_ALLOC + shm_free(data); + shm_free(res); + shm_free(buffer); + shm_free(input); + shm_free(ind1); + shm_free(ind2); + + free(tmp); +#endif + + return 0; +} diff --git a/tests/test_spatter.c b/tests/test_spatter.c new file mode 100644 index 0000000..a8cd1f3 --- /dev/null +++ b/tests/test_spatter.c @@ -0,0 +1,180 @@ +#include + +#include "client.h" +#include "client_cleanup.h" +#include "client_init.h" +#include "client_memory.h" +#include "client_place_requests.h" +#include "config.h" +#include "parse-args.h" +#include "shm_malloc.h" + +#define SP_MAX_ALLOC (65 * 1000 * 1000 * 1000) + +size_t remap_pattern(const int nrc, unsigned long *pattern, + const size_t pattern_len); + +int main(int argc, char **argv) { + struct run_config *rc; + int nrc = 0; + + parse_args(argc, argv, &nrc, &rc); + + if (nrc <= 0) { + printf("No run configurations parsed\n"); + return 1; + } + + if (rc[0].kernel != GATHER && rc[0].kernel != SCATTER && rc[0].kernel != GS) { + printf("Error: Unsupported kernel\n"); + exit(1); + } + + size_t max_pattern_val = 0; + + for (int i = 0; i < nrc; i++) + max_pattern_val = remap_pattern(nrc, rc[i].pattern, rc[i].pattern_len); + + printf("max pattern val: %d\n", max_pattern_val); + + struct client client; + client.chatty = 1; + init(&client); + + for (int i = 0; i < nrc; i++) { + for (int j = -1; j <= (int)rc[i].nruns; j++) { + size_t N = rc[i].pattern_len; + + double *res = (double *)shm_malloc(max_pattern_val * sizeof(double)); + double *input = (double *)shm_malloc(max_pattern_val * sizeof(double)); + + size_t *pattern = shm_malloc(N * sizeof(size_t)); + + 
memset(res, 0, max_pattern_val * sizeof(double)); + + for (size_t k = 0; k < max_pattern_val; k++) + input[k] = (double)(2 * k); + + for (size_t k = 0; k < N; k++) + pattern[k] = (size_t)rc[i].pattern[k]; + + switch (rc[i].kernel) { + case SCATTER: + printf("Scatter with Length: %d\n", N); + + struct request write_req; + scoria_write(&client, res, N, input, pattern, NULL, 0, 0, &write_req); + wait_request(&client, &write_req); + + break; + case GATHER: + printf("Gather with Length: %d\n", N); + + struct request read_req; + scoria_read(&client, input, N, res, pattern, NULL, 0, 0, &read_req); + wait_request(&client, &read_req); + + break; + default: + printf("Error: Unable to determine kernel\n"); + break; + } + + shm_free(res); + shm_free(input); + shm_free(pattern); + } + } + + cleanup(&client); + + return 0; +} + +size_t remap_pattern(const int nrc, unsigned long *pattern, + const size_t pattern_len) { + size_t max_pattern_val = pattern[0]; + + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] > max_pattern_val) { + max_pattern_val = pattern[j]; + } + } + + // Post-Processing to make heap values fit + size_t boundary = (((SP_MAX_ALLOC - 1) / sizeof(sgData_t)) / nrc) / 2; + // printf("Boundary: %zu, max_pattern_val: %zu, difference: %zu\n", boundary, + // max_pattern_val, max_pattern_val - boundary); + + if (max_pattern_val >= boundary) { + // printf("Inside of boundary if statement\n"); + int outside_boundary = 0; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + outside_boundary++; + } + } + + // printf("Configuration: %d, Number of indices outside of boundary: %d, + // Total pattern length: %d, Frequency of outside of boundary indices: + // %.2f\n", i, outside_boundary, rc[i].pattern_len, (double)outside_boundary + // / (double)rc[i].pattern_len ); + + // Initialize map to sentinel value of -1 + size_t heap_map[outside_boundary][2]; + for (size_t j = 0; j < outside_boundary; j++) { + heap_map[j][0] = -1; + heap_map[j][1] = 
-1; + } + + int pos = 0; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + // Search if exists in map + int found = 0; + for (size_t k = 0; k < pos; k++) { + if (heap_map[k][0] == pattern[j]) { + // printf("Already found %zu at position %zu\n", rc[i].pattern[j], + // k); + found = 1; + break; + } + } + + // If not found, add to map + if (!found) { + // printf("Inserting %zu, %zu into the heap map at position %zu\n", + // rc[i].pattern[j], boundary - pos, pos); + heap_map[pos][0] = pattern[j]; + heap_map[pos][1] = boundary - pos; + pos++; + } + } + } + + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] >= boundary) { + // Find entry in map + int idx = -1; + for (size_t k = 0; k < pos; k++) { + if (heap_map[k][0] == pattern[j]) { + // printf("Found at index: %d\n", k); + + // Map heap address to new address inside of boundary + pattern[j] = heap_map[k][1]; + break; + } + } + } + } + + max_pattern_val = pattern[0]; + for (size_t j = 0; j < pattern_len; j++) { + if (pattern[j] > max_pattern_val) { + max_pattern_val = pattern[j]; + } + } + } + + return max_pattern_val; +}